# List of required packages, including 'here'
required_packages <- c(
"plyr", "alr4", "caret", "car", "corrplot", "dplyr", "effects", "fastDummies", "ggplot2",
"GGally", "ggplot2", "ggpubr", "glmnet", "lmtest", "MASS", "ModelMetrics", "kableExtra",
"nortest", "olsrr", "onewaytests", "readr", "here", "stringr", "knitr", "reshape2", "leaflet",
"RColorBrewer", "scales", "purrr", "DT", "jsonlite", "magrittr", "rpart", "broom"
)
# Establish CRAN for package installs
options(repos = c(CRAN = "https://ftp.osuosl.org/pub/cran/")) # Set the CRAN mirror
# Check if each package is installed; if not, install it
for (pkg in required_packages) {
if (!(pkg %in% installed.packages()[,"Package"])) {
install.packages(pkg, dependencies = TRUE)
}
}
# Load all the packages without displaying masking warnings
lapply(required_packages, function(pkg) {
suppressMessages(library(pkg, character.only = TRUE))
})
# Build the full path to the directory containing the Rmd file
rmd_dir <- dirname(here())
# Navigate up one directory and then to the CSV data file
csv_file <- file.path(rmd_dir, "HomeValueForecaster", "KC_House_Sales.csv")
json_filepath <- file.path(rmd_dir, "HomeValueForecaster/Final Notebook", "model_parameters.json")
# Read the CSV file into a data frame
df <- read.csv(csv_file)
# Function to manually update the JSON file with model parameters
update_model_json <- function(model_name, features, filepath) {
# Read existing parameters if the file exists, or initialize an empty list
model_params <- if (file.exists(filepath)) {
fromJSON(filepath)
} else {
list()
}
# Update the parameters for the specified model
model_params[[model_name]] <- features
# Write the updated parameters back to the JSON file
write_json(model_params, filepath)
}
# Initialize the figure counter
fig_counter <- 0
table_counter <- 0
# Custom function to generate figure captions with automatic numbering
generate_figure_caption <- function(caption, section) {
fig_counter <<- fig_counter + 1
paste0("Figure ", section, ".", fig_counter, " ", caption)
}
# Custom function to generate figure captions with automatic numbering
generate_table_caption <- function(caption, section) {
table_counter <<- table_counter + 1
paste0("Table ", section, ".", table_counter, " ", caption)
}
The King County house sales dataset is a comprehensive collection of 21,613 observations, each representing a unique house sale. The dataset encompasses a variety of features that describe different aspects of the houses sold. Below is a detailed description of each variable in the dataset:
# Create a data frame for the table
data_description <- data.frame(
Variable = c(
'id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat',
'long', 'sqft_living15', 'sqft_lot15'
),
Description = c(
'Unique ID for each home sold (not used as a predictor)',
'Date of the home sale',
'Price of each home sold',
'Number of bedrooms',
'Number of bathrooms, ".5" accounts for a bathroom with a toilet but no shower',
'Square footage of the apartment interior living space',
'Square footage of the land space',
'Number of floors',
'A dummy variable for whether the apartment was overlooking the waterfront or not',
'An index from 0 to 4 of how good the view of the property was',
'An index from 1 to 5 on the condition of the apartment',
'An index from 1 to 13 about building construction and design quality',
'The square footage of the interior housing space above ground level',
'The square footage of the interior housing space below ground level',
'The year the house was initially built',
'The year of the house’s last renovation',
'The zipcode area the house is in',
'Latitude coordinate',
'Longitude coordinate',
'The square footage of interior housing living space for the nearest 15 neighbors',
'The square footage of the land lots of the nearest 15 neighbors'
)
)
# Create the table with kable
data_description_table <- kable(
data_description,
format = "html",
caption = generate_figure_caption(caption = "Data Description", section = 1)
) %>%
kable_styling(full_width = TRUE) %>%
column_spec(1, bold = TRUE)
# Print the table
data_description_table
| Variable | Description |
|---|---|
| id | Unique ID for each home sold (not used as a predictor) |
| date | Date of the home sale |
| price | Price of each home sold |
| bedrooms | Number of bedrooms |
| bathrooms | Number of bathrooms, “.5” accounts for a bathroom with a toilet but no shower |
| sqft_living | Square footage of the apartment interior living space |
| sqft_lot | Square footage of the land space |
| floors | Number of floors |
| waterfront | A dummy variable for whether the apartment was overlooking the waterfront or not |
| view | An index from 0 to 4 of how good the view of the property was |
| condition | An index from 1 to 5 on the condition of the apartment |
| grade | An index from 1 to 13 about building construction and design quality |
| sqft_above | The square footage of the interior housing space above ground level |
| sqft_basement | The square footage of the interior housing space below ground level |
| yr_built | The year the house was initially built |
| yr_renovated | The year of the house’s last renovation |
| zipcode | The zipcode area the house is in |
| lat | Latitude coordinate |
| long | Longitude coordinate |
| sqft_living15 | The square footage of interior housing living space for the nearest 15 neighbors |
| sqft_lot15 | The square footage of the land lots of the nearest 15 neighbors |
The dataset contains housing information for a total of 21,613 houses. The prices of these houses range from the minimum price of $0 to a maximum of $9.9 million. On average, the houses in this dataset have a price of approximately $540,000. The median price, which represents the middle value when all prices are arranged in ascending order, is $450,000. The most common price range falls within the first quartile, where houses have prices around $321,000 to $645,000. The dataset also includes information on various other factors, such as the number of bedrooms, bathrooms, square footage of living space, lot size, and more, all of which can impact house prices. Understanding the distribution and characteristics of house prices in this dataset is essential for any analysis or modeling task related to real estate.
# Data Preprocessing and Transformation
set.seed(1023) # Setting a seed for reproducibility
split_index <- sample(1:nrow(df), size = 0.7 * nrow(df))
train_df <- df[split_index, ]
test_df <- df[-split_index, ]
# Remove non-numeric characters from the 'price' column and convert it to numeric
df$price <- as.numeric(str_replace_all(df$price, "[^0-9.]", ""))
# Calculation of Convergence Point: Determine the convergence point for high-value homes
high_value_threshold <- quantile(df$price, probs = 0.95, na.rm = TRUE) # Calculate the high-value threshold
high_value_homes <- df[df$price >= high_value_threshold, ] # Select high-value homes
convergence_point <- c(mean(high_value_homes$lat, na.rm = TRUE), mean(high_value_homes$long, na.rm = TRUE))
# Remove non-numeric characters from the 'price' column and convert it to numeric
train_df$price <- as.numeric(str_replace_all(train_df$price, "[^0-9.]", ""))
test_df$price <- as.numeric(str_replace_all(test_df$price, "[^0-9.]", ""))
# Data Transformation Function with Distance Binning Option
transform_data <- function(df, convergence_point, linear_model) {
# Date Transformation: Convert the 'date' column to a Date object if present
if ("date" %in% colnames(df)) {
df$date <- as.Date(substr(as.character(df$date), 1, 8), format="%Y%m%d")
# Date-Time Feature Engineering: Extract various date-related features
df$year_sold <- lubridate::year(df$date)
df$month_sold <- lubridate::month(df$date)
df$day_sold <- lubridate::day(df$date)
df$season <- factor(lubridate::quarter(df$date), labels = c("Winter", "Spring", "Summer", "Fall"))
df$week_of_year <- lubridate::week(df$date)
df$day_of_year <- lubridate::yday(df$date)
}
# Creating Dummy Variables: Convert categorical variables into dummy variables
df <- df %>%
mutate(zipcode = as.factor(zipcode),
waterfront = as.factor(waterfront),
view = as.factor(view),
condition = as.factor(condition),
grade = as.numeric(grade),
grade = case_when(
grade %in% 1:3 ~ "Below_Average",
grade %in% 4:10 ~ "Average",
grade %in% 11:13 ~ "Above_Average")) %>%
dummy_cols(select_columns = c('zipcode', 'view', 'condition', 'grade', 'waterfront', 'season'))
# Remove last dummy variables to avoid multicollinearity
if (linear_model) {
df <- df[, !(names(df) %in% c("zipcode_98199", "view_0", "condition_1", "grade_13", "season_Winter", "waterfront_1"))]
}
# Haversine Distance Function: Calculate the distance between two points on Earth's surface
haversine_distance <- function(lat1, long1, lat2, long2) {
R <- 6371 # Earth radius in kilometers
delta_lat <- (lat2 - lat1) * pi / 180
delta_long <- (long2 - long1) * pi / 180
a <- sin(delta_lat/2)^2 + cos(lat1 * pi / 180) * cos(lat2 * pi / 180) * sin(delta_long/2)^2
c <- 2 * atan2(sqrt(a), sqrt(1 - a))
d <- R * c # Calculate the haversine distance
return(d)
}
# Calculate Haversine Distance
df$distance_to_convergence <- mapply(haversine_distance, df$lat, df$long,
MoreArgs = list(lat2 = convergence_point[1], long2 = convergence_point[2]))
# Remove columns that are no longer needed
df <- df[, !(names(df) %in% c("id", "date", "zipcode", "view", "condition", "grade", "waterfront", "season"))]
return(df)
}
# Applying the transformation function to training and test sets
train_df_linear <- transform_data(train_df, convergence_point, linear_model = TRUE) # Transform the training data for linear models
test_df_linear <- transform_data(test_df, convergence_point, linear_model = TRUE) # Transform the test data for linear models
train_df_non_linear <- transform_data(train_df, convergence_point, linear_model = FALSE) # Transform the training data
test_df_non_linear <- transform_data(test_df, convergence_point, linear_model = FALSE) # Transform the test data
# Set this to TRUE to update all the json model_parameters that are stored the JSON
# Check if the update_model_parameters is TRUE or not
update_model_parameters <- FALSE
# This updates the json with the parameters that were obtained from the intensive process of running
update_model_json <- function(model_name, features, filepath) {
model_params <- if (file.exists(filepath)) {
fromJSON(filepath)
} else {
list()
}
model_params[[model_name]] <- features
write_json(model_params, filepath)
}
The data preprocessing and transformation phase was crucial to prepare the dataset for accurate predictive analysis. This section outlines the key steps taken:
Exclusion of Non-Predictive Variables: The dataset
contained certain variables that were non-predictive in nature and
therefore not useful for our regression model. Specifically, the
id variable, serving as a unique identifier for each house
sale, was removed to prevent it from influencing house price
predictions. However, lat (latitude) and long
(longitude) were retained for their potential role in calculating
geographical distances, which could impact house prices.
Transformation of Data Types: To ensure consistency
and suitability for modeling, several variables underwent data type
transformation. Notably, the date variable, initially in
string format, was converted into a numeric format to facilitate its
incorporation into statistical models. Additionally, variables like
price, sqft_living, sqft_lot, and
others were converted to numeric formats.
Creation of Dummy Variables for Categorical Data:
Categorical variables such as waterfront,
view, condition, and grade were
transformed into dummy variables. This transformation was essential for
regression analysis, as it enabled the inclusion of non-numeric
variables in the model. The process involved converting these
categorical variables into binary variables (0 or 1). This was
particularly important for variables like waterfront, which
is inherently binary, and for ordinal variables like view
and condition, which possess an intrinsic order but needed
numerical representation for modeling.
Handling Special Cases in Variables: Variables like
bathrooms, which could have values like “0.5” to represent
bathrooms with a toilet but no shower, were retained in their original
form. These nuanced representations were preserved, as they carried
important information about the characteristics of the houses.
Grouping and Clustering of Variables: The
zipcode variable underwent transformation by extracting the
first three digits. This step reduced the number of dummy variables,
preventing model complexity while still capturing geographical
influences on house prices. Additionally, the grade
variable was clustered into broader categories to simplify the model and
focus on significant differences in construction and design quality.
Haversine Distance Calculation: A critical step was
the calculation of Haversine distances. This involved creating a
function to calculate the distance between two geographical points
represented by latitude and longitude coordinates. The calculated
haversine_distance was pivotal for understanding spatial
relationships and proximity to key locations that might affect house
prices.
Calculation of Convergence Point: A ‘convergence point’ was identified within the dataset, derived from houses with the highest values. This convergence point served as a reference to calculate the distance of each property from this central high-value location, potentially indicating a desirable area. Importantly, this step was executed on the training set alone to ensure the model accounted for locational desirability without data leakage.
| price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | sqft_above | sqft_basement | yr_built | yr_renovated | lat | long | sqft_living15 | sqft_lot15 | year_sold | month_sold | day_sold | week_of_year | day_of_year | zipcode_98001 | zipcode_98002 | zipcode_98003 | zipcode_98004 | zipcode_98005 | zipcode_98006 | zipcode_98007 | zipcode_98008 | zipcode_98010 | zipcode_98011 | zipcode_98014 | zipcode_98019 | zipcode_98022 | zipcode_98023 | zipcode_98024 | zipcode_98027 | zipcode_98028 | zipcode_98029 | zipcode_98030 | zipcode_98031 | zipcode_98032 | zipcode_98033 | zipcode_98034 | zipcode_98038 | zipcode_98039 | zipcode_98040 | zipcode_98042 | zipcode_98045 | zipcode_98052 | zipcode_98053 | zipcode_98055 | zipcode_98056 | zipcode_98058 | zipcode_98059 | zipcode_98065 | zipcode_98070 | zipcode_98072 | zipcode_98074 | zipcode_98075 | zipcode_98077 | zipcode_98092 | zipcode_98102 | zipcode_98103 | zipcode_98105 | zipcode_98106 | zipcode_98107 | zipcode_98108 | zipcode_98109 | zipcode_98112 | zipcode_98115 | zipcode_98116 | zipcode_98117 | zipcode_98118 | zipcode_98119 | zipcode_98122 | zipcode_98125 | zipcode_98126 | zipcode_98133 | zipcode_98136 | zipcode_98144 | zipcode_98146 | zipcode_98148 | zipcode_98155 | zipcode_98166 | zipcode_98168 | zipcode_98177 | zipcode_98178 | zipcode_98188 | zipcode_98198 | view_1 | view_2 | view_3 | view_4 | condition_2 | condition_3 | condition_4 | condition_5 | grade_Above_Average | grade_Average | grade_Below_Average | waterfront_0 | season_Spring | season_Summer | season_Fall | distance_to_convergence |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 890000 | 5 | 1.00 | 2590 | 4652 | 2 | 2310 | 280 | 1907 | 0 | 47.6038 | -122.294 | 2360 | 4650 | 2014 | 10 | 9 | 41 | 282 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 5.094525 |
| 355000 | 3 | 1.75 | 1960 | 7705 | 1 | 980 | 980 | 1950 | 0 | 47.5300 | -122.347 | 1380 | 4349 | 2014 | 9 | 23 | 38 | 266 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 13.117306 |
| 812500 | 4 | 2.75 | 2810 | 10300 | 1 | 1810 | 1000 | 1978 | 0 | 47.5626 | -122.149 | 2710 | 9900 | 2014 | 8 | 8 | 32 | 220 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 8.516639 |
| 290000 | 4 | 2.50 | 2000 | 13300 | 1 | 1200 | 800 | 1968 | 0 | 47.3530 | -122.294 | 1800 | 9810 | 2014 | 5 | 9 | 19 | 129 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 29.761115 |
| 410000 | 3 | 1.75 | 1660 | 6250 | 1 | 830 | 830 | 1980 | 0 | 47.5859 | -122.385 | 1660 | 5750 | 2014 | 7 | 9 | 28 | 190 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 12.202504 |
| 496000 | 3 | 2.50 | 2180 | 4533 | 2 | 2180 | 0 | 2010 | 0 | 47.7540 | -122.215 | 2180 | 7347 | 2014 | 11 | 17 | 46 | 321 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 15.268135 |
id Variableid variable, serving as a unique identifier for
each house sale, was removed from the dataset. This exclusion aimed to
prevent it from influencing house price predictions.lat (latitude)
and long (longitude) were retained in the dataset. These
geographic coordinates were preserved due to their potential role in
calculating geographical distances, which could significantly impact
house prices. While not directly predictive, they provide valuable
spatial information.date Variabledate variable, initially in string format,
underwent a critical transformation. It was converted into a numeric
format, allowing for easier incorporation into statistical models.
Numeric representations of dates are more amenable to various types of
analyses, including regression.price,
sqft_living, sqft_lot, and others, underwent
data type transformation. These variables were converted to numeric
formats to ensure consistency and suitability for modeling purposes.
Numeric representations are essential for performing mathematical
operations and statistical modeling.waterfront,
view, condition, and grade were
transformed into dummy variables. This transformation is pivotal for
regression analysis, as it allows these non-numeric variables to be
effectively included in the model.0
or 1). This binary representation is particularly important
for variables like waterfront, which is a binary indicator
itself, and for ordinal variables like view and
condition. This transformation preserves the inherent order
of these variables while making them suitable for numerical
modeling.bathrooms Variablebathrooms variable presented a unique challenge. It
contains values like "0.5" that represent bathrooms with a
toilet but no shower. In the code block, you can observe that we made a
conscious decision not to apply any transformation to this variable. By
doing so, we retained the nuanced details within the data. For instance,
these values indicate specific house characteristics that can influence
its price, such as the presence of a half-bathroom.zipcode Variablezipcode variable. This transformation involves extracting
the first three digits of the zip code. The purpose of this
transformation is twofold: it helps in reducing the number of dummy
variables, preventing the model from becoming overly complex, while
still retaining the essential geographical information. It allows us to
capture the broader regional influences on house prices without adding
excessive dimensions to the model.grade Variablehaversine_distance, is defined and applied to compute
distances between geographical points represented by latitude and
longitude. This step is critical for the model because it captures the
significance of geographical proximity. It helps the model understand
the spatial relationships and the impact of proximity to key locations
on house prices.add_model_performance FunctionThe add_model_performance function serves as a crucial
tool for evaluating regression models. It takes as input a model object
and conducts an evaluation using both training and testing datasets. The
function calculates key performance metrics, including Sum of Squared
Errors (SSE), R-squared, Root Mean Squared Error (RMSE), and Mean
Absolute Error (MAE) for both the training and test datasets. These
metrics provide valuable insights into the model’s predictive
capabilities. Additionally, it offers the flexibility to incorporate the
resulting performance metrics into an existing results dataframe or
create a new one if none is provided. This function streamlines the
evaluation process, making it easier to comprehensively compare and
assess different models.
view_model_results FunctionThe view_model_results function simplifies the task of
displaying model performance metrics in a user-friendly format.
Leveraging the datatable function, it generates an
interactive and visually appealing table for presenting model evaluation
results. Users can input a dataframe containing model performance
metrics, and this function will produce an informative table with
options for adding captions. This feature enhances the communication and
visualization of model performance, facilitating informed
decision-making when comparing various models.
extract_and_display_features FunctionThe extract_and_display_features function emphasizes
transparency and interpretability. Given a model object, it extracts and
presents the features utilized by the model for making predictions. This
transparency aids in understanding which variables influence the model’s
decisions. By displaying the relevant features, users gain insights into
the model’s feature selection process, enhancing model interpretability
and assisting in feature engineering efforts.
create_model FunctionThe create_model function simplifies the creation of
linear regression models based on user-defined feature subsets. Users
can specify a dataframe containing their data, the target variable, and
a list of features they wish to include in the model. The function
constructs the model’s formula and fits a linear regression model
accordingly. This flexibility empowers users to experiment with
different feature combinations and observe their impact on model
performance, aiding in the model selection process.
prepare_data FunctionThe prepare_data function streamlines the data
preparation process for model evaluation. It extracts features used in
the model, creates training and testing datasets without the target
variable, and organizes the data for model evaluation. By automating
these steps, this function ensures consistency and accuracy in the data
used for model assessment. Users can easily interface with the prepared
data, saving time and reducing the risk of errors during the evaluation
process.
evaluate_model FunctionThe evaluate_model function combines the elements of
model evaluation and result aggregation. Given a model, training and
testing datasets, and a target variable, it evaluates the model’s
performance and adds the resulting metrics to a results dataframe. Users
can specify a model name for identification purposes, facilitating easy
tracking of multiple model evaluations. This function streamlines the
process of comparing various models by accumulating their performance
metrics in a single dataframe, simplifying the decision-making process
when selecting the best model for a given task.
Collectively, these functions provide a comprehensive framework for efficiently evaluating and comparing regression models. They enhance transparency, simplify the evaluation process, and empower users to make informed decisions regarding model selection and feature engineering.
#' Add Model Performance to Dataframe
#'
#' This function evaluates a given regression model using training and testing datasets. It calculates
#' performance metrics like SSE, R-squared, RMSE, and MAE for both training and test sets, and then adds
#' these metrics to a provided results dataframe or creates one if not provided. The function ensures
#' that both training and testing datasets contain the same features used in the model before performing
#' predictions and calculations.
#'
#' @param model_name A string representing the name of the model.
#' @param model The model object to be evaluated.
#' @param x_train A dataframe containing the features of the training data.
#' @param y_train A vector containing the target variable of the training data.
#' @param x_test A dataframe containing the features of the test data.
#' @param y_test A vector containing the target variable of the test data.
#' @param df_results An optional dataframe where model performance metrics will be added (default: NULL).
#' @return The updated dataframe with the added model performance metrics.
#'
#' @examples
#' linear_model <- lm(price ~ ., data = train_df)
#' df_results <- add_model_performance("Linear Model", linear_model, x_train, y_train, x_test, y_test)
#'
#' # If df_results already exists and you want to add more results:
#' df_results <- add_model_performance("Another Model", another_model, x_train, y_train, x_test, y_test, df_results)
#'
#' @details
#' The function first extracts the features used in the model and checks if these features are present in
#' both the training and testing datasets. It then uses the model to predict the target variable on both
#' datasets and calculates performance metrics. These metrics are added to a dataframe that either exists
#' or is created within the function. This dataframe can be used for comparing different models' performances.
#'
#' It is assumed that the model is correctly specified with the appropriate features and the dataframes
#' provided to the function align with the model's structure.
add_model_performance <- function(model_name, model, x_train, y_train, x_test, y_test, df_results = NULL) {
# Create df_results if not provided
if (is.null(df_results)) {
df_results <- data.frame(
Model = character(),
SSE_train = double(),
SSE_test = double(),
R_squared_train = double(),
R_squared_test = double(),
RMSE_train = double(),
RMSE_test = double(),
MAE_train = double(),
MAE_test = double(),
stringsAsFactors = FALSE
)
}
y_hat_train <- predict(model, newdata = x_train)
y_hat_test <- predict(model, newdata = x_test)
# Performance metrics calculation
mae_train <- mean(abs(y_train - y_hat_train))
mae_test <- mean(abs(y_test - y_hat_test))
sse_train <- sum((y_train - y_hat_train)^2)
sse_test <- sum((y_test - y_hat_test)^2)
tss_train <- sum((y_train - mean(y_train))^2)
tss_test <- sum((y_test - mean(y_test))^2)
rsq_train <- 1 - (sse_train / tss_train)
rsq_test <- 1 - (sse_test / tss_test)
rmse_train <- sqrt(mean((y_train - y_hat_train)^2))
rmse_test <- sqrt(mean((y_test - y_hat_test)^2))
# Appending results to the dataframe
new_row <- data.frame(
Model = model_name,
SSE_train = sse_train,
SSE_test = sse_test,
R_squared_train = rsq_train,
R_squared_test = rsq_test,
RMSE_train = rmse_train,
RMSE_test = rmse_test,
MAE_train = mae_train,
MAE_test = mae_test
)
df_results <- rbind(df_results, new_row)
# Returning the updated dataframe
return(df_results)
}
#' Display Model Results using Datatable
#'
#' This function displays a dataframe containing model performance metrics using the `datatable` function.
#'
#' @param df_results A dataframe containing model performance metrics.
#' @param caption Optional caption for the table (default: "Model Comparison").
#' @return NULL (it displays the table but doesn't return a value).
view_model_results <- function(df_results, caption) {
# Identifying numeric columns other than "Model", "R_squared_train", and "R_squared_test"
cols_to_round <- setdiff(names(df_results[sapply(df_results, is.numeric)]), c("Model", "R_squared_train", "R_squared_test"))
# Round these specific columns to 2 decimal places
df_results[cols_to_round] <- lapply(df_results[cols_to_round], round, 2)
# Round R-squared columns to 4 decimal places
df_results$R_squared_train <- round(df_results$R_squared_train, 5)
df_results$R_squared_test <- round(df_results$R_squared_test, 5)
# Display the dataframe using datatable
datatable(
df_results,
caption = caption,
options = list(
paging = FALSE,
autoWidth = TRUE,
scrollX = TRUE,
fixedColumns = list(leftColumns = 1)
)
)
}
extract_and_display_features <- function(model, full_df, target_var) {
features_used <- setdiff(names(coef(model)), "(Intercept)")
features_used <- features_used[features_used != "(Intercept)"]
# Get all the feature names from the full dataset, excluding the target variable
all_features <- names(full_df)
all_features <- all_features[all_features != target_var]
# Identify features not used in the model and print if desired
# unused_features <- setdiff(all_features, features_used)
# Return the features that were used as a character vector
return(features_used)
}
# Function to create a linear model based on a subset of features
create_model <- function(df, target_var, features) {
# Construct the formula for lm()
formula <- as.formula(paste(target_var, "~", paste(features, collapse = "+")))
# Fit the linear model
return(lm(formula, data = df))
}
prepare_data <- function(model, train_df, test_df, target_var) {
# Extracting used features from the model
used_features <- extract_and_display_features(model, train_df, target_var = target_var)
# Create x_train and x_test without the target variable
x_train <- subset(train_df, select = used_features)
x_test <- subset(test_df, select = used_features)
y_train <- train_df[[target_var]]
y_test <- test_df[[target_var]]
return(list(x_train = x_train, x_test = x_test, y_train = y_train, y_test = y_test))
}
evaluate_model <- function(model_name, model, train_df, test_df, target_var, df_results) {
# Filter data into x and y subsets
data <- prepare_data(model, train_df, test_df, target_var = target_var)
# Adding model performance to the results dataframe
df_results <- add_model_performance(
model_name = model_name,
model = model,
x_train = data$x_train,
y_train = data$y_train,
x_test = data$x_test,
y_test = data$y_test,
df_results = df_results # Or pass existing df_results if available
)
return(df_results)
}
# Create an empty coefficients data frame if it doesn't exist, otherwise add rows to it
create_coefficients_df <- function(model, model_name = "Your Model Name", coefficients_df = NULL) {
# Check the type of model and extract coefficients accordingly
if (inherits(model, "lm")) {
coefficients <- coef(model)
} else if (inherits(model, "dgCMatrix")) { # If it's a matrix from glmnet
coefficients <- as.vector(model) # Convert matrix to vector
names(coefficients) <- rownames(model) # Use rownames from matrix as names
} else {
stop("Input model is not a recognized type.")
}
# Prepare the model data as a dataframe row
model_row <- data.frame(t(coefficients), check.names = FALSE)
model_row$Model_Name <- model_name # Add the model name as a new column
# If coefficients_df is not provided, initialize a new data frame
if (is.null(coefficients_df)) {
coefficients_df <- model_row
} else {
# Align the model_row with the coefficients_df
# Adding NA columns for missing features in coefficients_df
missing_cols_in_df <- setdiff(names(model_row), names(coefficients_df))
if (length(missing_cols_in_df) > 0) {
coefficients_df[missing_cols_in_df] <- NA
}
# Adding NA columns for missing features in model_row
missing_cols_in_row <- setdiff(names(coefficients_df), names(model_row))
if (length(missing_cols_in_row) > 0) {
model_row[missing_cols_in_row] <- NA
}
# Ensure both dataframes have the same column order
model_row <- model_row[names(coefficients_df)]
# Bind the new row to the existing dataframe
coefficients_df <- rbind(coefficients_df, model_row)
}
# Sort the columns alphabetically, except 'Model_Name'
cols_order <- c("Model_Name", sort(setdiff(names(coefficients_df), "Model_Name")))
coefficients_df <- coefficients_df[, cols_order]
return(coefficients_df)
}
# Fit a linear regression model to the training data
linear_model_initial <- lm(price ~ ., data = train_df_linear)
# Initalize and start a coefficients_df to examine later
coefficients_df <- create_coefficients_df(linear_model_initial, "Initial OLS Model")
# Evaluate OLS_linear
df_results <- evaluate_model("OLS_linear", linear_model_initial, train_df_linear, test_df_linear, target_var = 'price', NULL)
# Add results to the df_results to view and sort later
view_model_results(df_results, caption = generate_table_caption("OLS Linear Model Table", section = 4))
# Show inital linear model results
summary(linear_model_initial)
##
## Call:
## lm(formula = price ~ ., data = train_df_linear)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1415466 -62623 2607 59286 4245843
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.158e+07 1.366e+07 -4.509 6.56e-06 ***
## bedrooms -2.146e+04 1.736e+03 -12.363 < 2e-16 ***
## bathrooms 2.285e+04 2.994e+03 7.634 2.42e-14 ***
## sqft_living 1.350e+02 4.033e+00 33.483 < 2e-16 ***
## sqft_lot 3.055e-01 4.669e-02 6.542 6.25e-11 ***
## floors -3.145e+04 3.666e+03 -8.577 < 2e-16 ***
## sqft_above 7.569e+01 4.143e+00 18.268 < 2e-16 ***
## sqft_basement NA NA NA NA
## yr_built -9.304e+00 7.239e+01 -0.129 0.897732
## yr_renovated 3.122e+01 3.384e+00 9.224 < 2e-16 ***
## lat -1.572e+05 7.696e+04 -2.042 0.041156 *
## long 2.761e+05 6.335e+04 4.358 1.32e-05 ***
## sqft_living15 3.335e+01 3.253e+00 10.254 < 2e-16 ***
## sqft_lot15 -1.416e-01 6.931e-02 -2.043 0.041045 *
## year_sold 5.138e+04 5.139e+03 9.999 < 2e-16 ***
## month_sold 2.958e+05 6.753e+04 4.381 1.19e-05 ***
## day_sold 9.429e+03 2.220e+03 4.246 2.19e-05 ***
## week_of_year -3.651e+03 3.777e+03 -0.967 0.333807
## day_of_year -9.050e+03 2.281e+03 -3.968 7.29e-05 ***
## zipcode_98001 -2.903e+05 2.905e+04 -9.993 < 2e-16 ***
## zipcode_98002 -2.982e+05 3.078e+04 -9.689 < 2e-16 ***
## zipcode_98003 -2.774e+05 2.919e+04 -9.504 < 2e-16 ***
## zipcode_98004 1.999e+05 2.382e+04 8.391 < 2e-16 ***
## zipcode_98005 -2.460e+05 2.613e+04 -9.412 < 2e-16 ***
## zipcode_98006 -2.606e+05 2.426e+04 -10.740 < 2e-16 ***
## zipcode_98007 -2.842e+05 2.757e+04 -10.308 < 2e-16 ***
## zipcode_98008 -2.795e+05 2.568e+04 -10.885 < 2e-16 ***
## zipcode_98010 -2.756e+05 3.688e+04 -7.471 8.37e-14 ***
## zipcode_98011 -2.987e+05 2.175e+04 -13.733 < 2e-16 ***
## zipcode_98014 -2.992e+05 3.576e+04 -8.366 < 2e-16 ***
## zipcode_98019 -3.279e+05 2.996e+04 -10.945 < 2e-16 ***
## zipcode_98022 -2.498e+05 4.049e+04 -6.170 7.00e-10 ***
## zipcode_98023 -2.687e+05 2.926e+04 -9.186 < 2e-16 ***
## zipcode_98024 -3.047e+05 3.647e+04 -8.353 < 2e-16 ***
## zipcode_98027 -3.025e+05 2.727e+04 -11.092 < 2e-16 ***
## zipcode_98028 -2.869e+05 1.928e+04 -14.883 < 2e-16 ***
## zipcode_98029 -2.495e+05 2.855e+04 -8.737 < 2e-16 ***
## zipcode_98030 -3.644e+05 2.747e+04 -13.268 < 2e-16 ***
## zipcode_98031 -3.911e+05 2.589e+04 -15.105 < 2e-16 ***
## zipcode_98032 -3.441e+05 2.853e+04 -12.062 < 2e-16 ***
## zipcode_98033 -1.448e+05 2.097e+04 -6.902 5.32e-12 ***
## zipcode_98034 -2.591e+05 1.864e+04 -13.898 < 2e-16 ***
## zipcode_98038 -3.538e+05 3.043e+04 -11.629 < 2e-16 ***
## zipcode_98039 5.373e+05 3.368e+04 15.955 < 2e-16 ***
## zipcode_98040 -1.050e+04 2.302e+04 -0.456 0.648396
## zipcode_98042 -3.754e+05 2.818e+04 -13.324 < 2e-16 ***
## zipcode_98045 -2.583e+05 3.911e+04 -6.604 4.13e-11 ***
## zipcode_98052 -2.567e+05 2.252e+04 -11.398 < 2e-16 ***
## zipcode_98053 -2.794e+05 2.627e+04 -10.633 < 2e-16 ***
## zipcode_98055 -4.032e+05 2.388e+04 -16.884 < 2e-16 ***
## zipcode_98056 -4.069e+05 2.325e+04 -17.499 < 2e-16 ***
## zipcode_98058 -4.151e+05 2.501e+04 -16.597 < 2e-16 ***
## zipcode_98059 -4.181e+05 2.459e+04 -17.001 < 2e-16 ***
## zipcode_98065 -3.471e+05 3.369e+04 -10.303 < 2e-16 ***
## zipcode_98070 -2.809e+05 2.823e+04 -9.952 < 2e-16 ***
## zipcode_98072 -2.732e+05 2.283e+04 -11.968 < 2e-16 ***
## zipcode_98074 -3.050e+05 2.623e+04 -11.631 < 2e-16 ***
## zipcode_98075 -3.064e+05 2.769e+04 -11.065 < 2e-16 ***
## zipcode_98077 -3.164e+05 2.646e+04 -11.958 < 2e-16 ***
## zipcode_98092 -3.381e+05 3.071e+04 -11.008 < 2e-16 ***
## zipcode_98102 7.898e+04 2.148e+04 3.678 0.000236 ***
## zipcode_98103 -8.021e+04 1.369e+04 -5.858 4.79e-09 ***
## zipcode_98105 1.914e+04 1.872e+04 1.022 0.306613
## zipcode_98106 -3.009e+05 1.713e+04 -17.564 < 2e-16 ***
## zipcode_98107 -4.959e+04 1.548e+04 -3.202 0.001366 **
## zipcode_98108 -3.474e+05 2.044e+04 -16.993 < 2e-16 ***
## zipcode_98109 8.410e+04 2.097e+04 4.010 6.10e-05 ***
## zipcode_98112 1.733e+05 1.911e+04 9.070 < 2e-16 ***
## zipcode_98115 -1.165e+05 1.516e+04 -7.683 1.65e-14 ***
## zipcode_98116 -1.168e+05 1.538e+04 -7.590 3.39e-14 ***
## zipcode_98117 -6.259e+04 1.336e+04 -4.684 2.84e-06 ***
## zipcode_98118 -3.281e+05 1.923e+04 -17.059 < 2e-16 ***
## zipcode_98119 8.355e+04 1.696e+04 4.927 8.42e-07 ***
## zipcode_98122 -1.560e+05 1.896e+04 -8.226 < 2e-16 ***
## zipcode_98125 -2.224e+05 1.586e+04 -14.024 < 2e-16 ***
## zipcode_98126 -2.151e+05 1.652e+04 -13.019 < 2e-16 ***
## zipcode_98133 -2.045e+05 1.527e+04 -13.389 < 2e-16 ***
## zipcode_98136 -1.430e+05 1.750e+04 -8.170 3.33e-16 ***
## zipcode_98144 -2.205e+05 1.876e+04 -11.750 < 2e-16 ***
## zipcode_98146 -3.162e+05 1.884e+04 -16.785 < 2e-16 ***
## zipcode_98148 -3.159e+05 2.921e+04 -10.817 < 2e-16 ***
## zipcode_98155 -2.349e+05 1.666e+04 -14.097 < 2e-16 ***
## zipcode_98166 -3.213e+05 2.088e+04 -15.389 < 2e-16 ***
## zipcode_98168 -3.716e+05 2.049e+04 -18.136 < 2e-16 ***
## zipcode_98177 -1.182e+05 1.784e+04 -6.625 3.59e-11 ***
## zipcode_98178 -4.312e+05 2.213e+04 -19.482 < 2e-16 ***
## zipcode_98188 -3.848e+05 2.521e+04 -15.265 < 2e-16 ***
## zipcode_98198 -3.478e+05 2.444e+04 -14.230 < 2e-16 ***
## view_1 7.219e+04 1.037e+04 6.964 3.45e-12 ***
## view_2 7.069e+04 6.395e+03 11.054 < 2e-16 ***
## view_3 1.712e+05 8.963e+03 19.098 < 2e-16 ***
## view_4 3.517e+05 1.346e+04 26.128 < 2e-16 ***
## condition_2 1.179e+05 3.612e+04 3.263 0.001104 **
## condition_3 1.340e+05 3.326e+04 4.029 5.63e-05 ***
## condition_4 1.583e+05 3.327e+04 4.759 1.96e-06 ***
## condition_5 2.049e+05 3.347e+04 6.122 9.50e-10 ***
## grade_Above_Average 2.119e+05 9.063e+04 2.339 0.019372 *
## grade_Average -1.414e+05 9.005e+04 -1.570 0.116407
## grade_Below_Average NA NA NA NA
## waterfront_0 -5.148e+05 1.901e+04 -27.075 < 2e-16 ***
## season_Spring 1.012e+04 5.904e+03 1.714 0.086462 .
## season_Summer 1.257e+03 9.637e+03 0.130 0.896191
## season_Fall -9.601e+03 1.414e+04 -0.679 0.497116
## distance_to_convergence -1.010e+04 7.672e+02 -13.161 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 154600 on 15027 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 671.4 on 101 and 15027 DF, p-value: < 2.2e-16
In this section, we provide an in-depth evaluation of the initial linear regression model’s performance from a data modeling perspective in the context of predicting house prices.
The model exhibits a respectable fit to the data with an adjusted R-squared value of 0.8174. This metric suggests that approximately 81.74% of the variance in house prices is accounted for by the independent variables included in the model. While this is a strong start, it’s important to explore whether there is room for model improvement.
Several predictors stand out as statistically significant
contributors to the model’s predictive power. Notably, variables such as
bedrooms, bathrooms, sqft_living,
yr_renovated, lat, long, and
view have coefficients with p-values less than 0.05. These
variables have a substantial influence on predicting house prices and
are essential components of the model.
The (Intercept) term represents the baseline house price
when all other predictors are set to zero. It is crucial in
understanding the inherent value of a house. Interpretation of the
baseline price helps assess the model’s ability to capture the impact of
other variables.
The model also reveals challenges, such as missing coefficients for
some variables (e.g., sqft_basement), indicating potential
data quality issues. Addressing these gaps is vital to enhance the
model’s performance and interpretability.
Residual analysis indicates that the model’s residuals range from -1,415,466 to 4,245,843, suggesting the presence of heteroscedasticity or outliers. Deeper investigation into these issues is necessary to refine the model and ensure its robustness.
The model incorporates a wide range of features, including property characteristics, location-based attributes, and temporal variables. These features collectively contribute to its predictive capacity. Ongoing feature engineering efforts should focus on selecting relevant variables and transforming them effectively to improve the model’s accuracy.
While the initial model provides valuable insights, further optimization is warranted. Exploring alternative regression techniques, addressing multicollinearity among predictors, and employing feature selection methods can help enhance model performance.
We will commence with a comprehensive exploration of our dataset in Section 5, utilizing various visualization techniques to gain insights into the relationships between variables. This exploration includes scatter plots (Section 5.1), a correlation matrix (Section 5.2), and in-depth relationship analysis (Section 5.3).
In Section 6, we will employ a stepwise model selection approach to identify the most relevant predictors for our linear regression model. This process will be detailed in Section 6.1. Subsequently, we will present the results for the best linear model iterations in Sections 6.2 and 6.3, followed by a model comparison in Section 6.4.
Ensuring the validity of linear regression assumptions is vital, and we will conduct thorough checks in Section 7. This includes assessing the linearity assumption (Section 7.1), examining the normality of residuals (Section 7.2), and verifying homoscedasticity (constant variance) (Section 7.3).
In Section 8, we will address issues related to heteroscedasticity and multicollinearity. Initially, we will detect and analyze heteroscedasticity in Section 8.1, followed by the presentation of remedial measures in Section 8.2. Subsequently, Section 8.3 will focus on the detection of multicollinearity, with Section 8.4 offering solutions to mitigate its impact.
As we progress, we will explore alternative modeling techniques in Section 9. This includes the implementation of a regression tree model (Section 9.1), a neural network model (Section 9.2), and a support vector machine (SVM) model (Section 9.3). If applicable, we will also consider a logistic regression model (Section 9.4). These alternative models will provide valuable insights and potential enhancements to our predictive capabilities.
# Scatter plot of Price vs. Square Footage of Living Space
ggplot(data = train_df_non_linear, aes(x = sqft_living, y = price)) +
geom_point(pch = 20, col = "blue") +
labs(title = "Price vs. Square Footage of Living Space",
subtitle = "Seattle Housing Data",
x = "Sqft Living Space",
y = "Price",
caption = generate_figure_caption("Price vs. Square Footage of Living Space", section = 5))
In the scatter plot above, we compare the price of homes
against their sqft_living (square footage of interior
living space). This visualization allows us to explore the relationship
between these two variables.
# Distribution of Square Footage of Living Space
ggplot(data = train_df_non_linear, aes(x = sqft_living)) +
geom_histogram(bins = 50) +
labs(title = "Distribution of Sqft Living Space",
x = "Sqft Living Space",
y = "Density",
caption = generate_figure_caption("Distribution of Sqft Living Space", section = 5))
The histogram above displays the distribution of
sqft_living. It reveals that the variable is right-skewed,
with most homes having smaller living spaces and relatively fewer very
large living spaces.
# Scatter plot of Price vs. Square Footage of Lot
ggplot(data = train_df_non_linear, aes(x = sqft_lot, y = price)) +
geom_point(pch = 20, col = "green") +
labs(title = "Price vs. Sqft Lot",
x = "Sqft Lot",
y = "Price",
caption = generate_figure_caption("Price vs. Sqft Lot", section = 5))
The scatter plot above compares price against
sqft_lot (square footage of land space). It helps us
understand if there’s any relationship between the size of the lot and
the sale price.
# Distribution of Square Footage of Lot
ggplot(data = train_df_non_linear, aes(x = sqft_lot)) +
geom_histogram(bins = 50) +
labs(title = "Distribution of Sqft Lot",
x = "Sqft Lot",
y = "Density",
caption = generate_figure_caption("Distribution of Sqft Lot", section = 5))
The histogram above visualizes the distribution of
sqft_lot. Similar to sqft_living, this
variable is right-skewed, with most homes having smaller lot sizes and
relatively fewer very large lots.
# Scatter plot of Price vs. Square Footage Above Ground
ggplot(data = train_df_non_linear, aes(x = sqft_above, y = price)) +
geom_point(pch = 20, col = "red") +
labs(title = "Price vs. Sqft Above Ground",
x = "Sqft Above Ground",
y = "Price",
caption = generate_figure_caption("Price vs. Sqft Above Ground", section = 5))
In the scatter plot above, we compare price against
sqft_above (square footage of the interior housing space
above ground level). This analysis helps us explore the impact of
above-ground living space on home prices.
# Distribution of Square Footage Above Ground
ggplot(data = train_df_non_linear, aes(x = sqft_above)) +
geom_histogram(bins = 50) +
labs(title = "Distribution of Sqft Above Ground",
x = "Sqft Above Ground",
y = "Density",
caption = generate_figure_caption("Distribution of Sqft Above Ground", section = 5))
The histogram above shows the distribution of
sqft_above. It suggests that most homes have similar
above-ground square footage, with relatively fewer having significantly
larger or smaller above-ground spaces.
Excluding homes that do not have a basement.
# Scatter plot of Price vs. Square Footage of Basement (excluding 0 sqft basement)
# Filter data for non-zero sqft_basement
filtered_data <- train_df_non_linear[train_df_non_linear$sqft_basement > 0,]
# Scatter plot with custom caption
ggplot(data = filtered_data, aes(x = sqft_basement, y = price)) +
geom_point(pch = 20, col = "purple") +
labs(title = "Price vs. Sqft Basement",
x = "Sqft Basement",
y = "Price",
caption = generate_figure_caption("Price vs. Sqft Basement (Non-Zero Values)", section = 5))
The scatter plot above compares price against
sqft_basement (square footage of the interior housing space
below ground level). This visualization helps us understand if the
presence and size of a basement influence home prices.
# Distribution of Square Footage of Basement (excluding 0 values)
# Histogram with custom caption
ggplot(data = filtered_data, aes(x = sqft_basement)) +
geom_histogram(bins = 50) +
labs(title = "Distribution of Sqft Basement",
x = "Sqft Basement",
y = "Density",
caption = generate_figure_caption("Distribution of Sqft Basement (Non-Zero Values)", section = 5))
The histogram above visualizes the distribution of
sqft_basement. It indicates that most homes have little to
no basement space, while some have larger basement areas.
# Scatter plot of Price vs. Year Built
ggplot(data = train_df_non_linear, aes(x = yr_built, y = price)) +
geom_point(pch = 20, col = "orange") +
labs(title = "Price vs. Year Built",
x = "Year Built",
y = "Price",
caption = generate_figure_caption("Price vs. Year Built", section = 5))
The scatter plot above compares price against the year
when homes were initially built (yr_built). This analysis
helps us understand how the age of a home relates to its sale price.
# Distribution of Year Built
ggplot(data = train_df_non_linear, aes(x = yr_built)) +
geom_histogram(bins = 50) +
labs(title = "Distribution of Year Built",
x = "Year Built",
y = "Density",
caption = generate_figure_caption("Distribution of Year Built", section = 5))
The histogram above displays the distribution of
yr_built. It provides insights into the distribution of
home ages in the dataset.
Excluding homes that did not have a documented renovation.
# Find lowest non-zero year renovated
lowest_non_zero_renovation_year <- min(train_df_non_linear$yr_renovated[train_df_non_linear$yr_renovated > 0]) - 1
# Filter data for non-zero yr_renovated
filtered_data <- train_df_non_linear[train_df_non_linear$yr_renovated > 0,]
# Scatter plot of Price vs. Year Renovated
lowest_non_zero_renovation_year <- min(train_df_non_linear$yr_renovated[train_df_non_linear$yr_renovated > 0]) - 1
ggplot(data = filtered_data, aes(x = yr_renovated, y = price)) +
geom_point(pch = 20, col = "brown") +
labs(title = "Price vs. Year Renovated",
x = "Year Renovated",
y = "Price",
caption = generate_figure_caption("Price vs. Year Renovated (Non-Zero Values)", section = 5)) +
xlim(c(lowest_non_zero_renovation_year, max(train_df_non_linear$yr_renovated)))
In the scatter plot above, we compare price against the
year of the last renovation (yr_renovated). This analysis
helps us understand whether recent renovations impact home prices.
# Find lowest non-zero year renovated
lowest_non_zero_renovation_year <- min(train_df_non_linear$yr_renovated[train_df_non_linear$yr_renovated > 0]) - 1
# Filter data for non-zero yr_renovated
filtered_data <- train_df_non_linear[train_df_non_linear$yr_renovated > 0,]
# Histogram of Year Renovated
ggplot(data = filtered_data, aes(x = yr_renovated)) +
geom_histogram(fill = "orange") +
labs(title = "Histogram of Year Renovated",
x = "Year Renovated",
y = "Density",
caption = generate_figure_caption("Histogram of Year Renovated (Non-Zero Values)", section = 5)) +
xlim(c(lowest_non_zero_renovation_year, max(train_df_non_linear$yr_renovated)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The histogram above visualizes the distribution of
yr_renovated. It provides insights into the distribution of
renovation years in the dataset.
# Scatter plot of Price vs. Distance to Convergence
ggplot(data = train_df_non_linear, aes(x = distance_to_convergence, y = price)) +
geom_point(pch = 20, col = "violet") +
labs(title = "Price vs. Distance to Convergence",
x = "Distance to Convergence",
y = "Price",
caption = generate_figure_caption("Price vs. Distance to Convergence", section = 5))
The scatter plot above compares price against
distance_to_convergence. This analysis helps us explore
whether the distance to a convergence point impacts home prices.
# Distribution of Distance to Convergence
ggplot(data = train_df_non_linear, aes(x = distance_to_convergence)) +
geom_histogram(bins = 50) +
labs(title = "Distribution of Distance to Convergence",
x = "Distance to Convergence",
y = "Density",
caption = generate_figure_caption("Distribution of Distance to Convergence", section = 5))
The distribution and count of categorical variables such as
bedrooms, bathrooms, floors,
waterfront, view, condition, and
grade are analyzed.
# Convert bedrooms to factor
train_df_non_linear$bedrooms_factor <- factor(train_df_non_linear$bedrooms)
# Binned Boxplot of Price vs. Bedrooms
ggplot(data = train_df_non_linear, aes(x = bedrooms_factor, y = price)) +
geom_boxplot(fill = "blue") +
labs(title = "Price vs. Bedrooms",
x = "Bedrooms",
y = "Price",
caption = generate_figure_caption("Price vs. Bedrooms", section = 5))
The scatter plot above compares price against the number
of bedrooms. This visualization helps us understand how the
number of bedrooms influences home prices.
# Filter data excluding 33 bedrooms
filtered_bedrooms <- train_df_non_linear$bedrooms[train_df_non_linear$bedrooms != 33]
# Calculate frequencies of each bedroom count
bedroom_frequencies <- table(filtered_bedrooms)
ggplot(data = data.frame(filtered_bedrooms = as.factor(names(bedroom_frequencies)),
filtered_counts = as.numeric(bedroom_frequencies)),
aes(x = filtered_bedrooms, y = filtered_counts)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Distribution of Bedrooms (Excluding 33 Bedrooms)",
x = "Number of Bedrooms",
y = "Frequency",
caption = generate_figure_caption("Distribution of Bedrooms (Excluding 33 Bedrooms)", section = 5))
The bar plot above displays the distribution of the
bedrooms variable, showing the frequency of each bedroom
count.
# Convert bathrooms to factor
train_df_non_linear$bathrooms_factor <- factor(train_df_non_linear$bathrooms)
# Binned Boxplot of Price vs. Bathrooms
ggplot(data = train_df_non_linear, aes(x = bathrooms_factor, y = price)) +
geom_boxplot(fill = "green") +
labs(title = "Price vs. Bathrooms",
x = "Bathrooms",
y = "Price",
caption = generate_figure_caption("Price vs. Bathrooms", section = 5))
In the scatter plot above, we compare price against the
number of bathrooms. This analysis helps us explore the
relationship between the number of bathrooms and home prices.
# Get data for bar plot
bathrooms_counts <- table(train_df_non_linear$bathrooms)
bathrooms <- as.numeric(names(bathrooms_counts))
counts <- as.numeric(bathrooms_counts)
# Bar plot for the distribution of Bathrooms
ggplot(data = data.frame(bathrooms, counts), aes(x = bathrooms, y = counts)) +
geom_bar(stat = "identity", fill = "green") +
labs(title = "Distribution of Bathrooms",
x = "Number of Bathrooms",
y = "Frequency",
caption = generate_figure_caption("Distribution of Bathrooms", section = 5))
The bar plot above visualizes the distribution of the
bathrooms variable, showing the frequency of each bathroom
count.
# Binned Boxplot of Price vs. Floors
ggplot(data = train_df_non_linear, aes(x = floors, y = price, group = floors)) +
geom_boxplot(fill = "orange") +
labs(title = "Price vs. Floors",
x = "Floors",
y = "Price",
caption = generate_figure_caption("Price vs. Floors", section = 5))
The scatter plot above compares price against the number
of floors. This analysis helps us understand how the number
of floors in a home relates to its sale price.
floors_counts <- table(train_df_non_linear$floors)
floors <- as.numeric(names(floors_counts))
counts <- as.numeric(floors_counts)
# Bar plot for the distribution of Floors
ggplot(data = data.frame(floors, counts), aes(x = floors, y = counts)) +
geom_bar(stat = "identity", fill = "orange") +
labs(title = "Distribution of Floors",
x = "Number of Floors",
y = "Frequency",
caption = generate_figure_caption("Distribution of Floors", section = 5))
The bar plot above displays the distribution of the
floors variable, showing the frequency of each floor
count.
ggplot(data = train_df_non_linear, aes(x = waterfront_1, y = price, group = waterfront_1)) +
geom_boxplot(fill = "purple") +
labs(title = "Price vs. Waterfront",
x = "Waterfront",
y = "Price",
caption = generate_figure_caption("Price vs. Waterfront", section = 5),
fill = "Waterfront",
levels = c("No", "Yes")) # Labels for waterfront status
In the scatter plot above, we compare price against the
waterfront variable. This visualization helps us explore
how having a waterfront view impacts home prices.
# Get data for bar plot
waterfront_counts <- table(train_df_non_linear$waterfront_1)
waterfront <- as.numeric(names(waterfront_counts))
counts <- as.numeric(waterfront_counts)
# Bar plot for the distribution of Waterfront
ggplot(data = data.frame(waterfront, counts), aes(x = waterfront, y = counts)) +
geom_bar(stat = "identity", fill = "purple") +
labs(title = "Distribution of Waterfront",
x = "Waterfront",
y = "Frequency",
caption = generate_figure_caption("Distribution of Waterfront", section = 5),
fill = "Waterfront",
levels = c("No", "Yes")) # Labels for waterfront status
The bar plot above visualizes the distribution of the
waterfront variable, showing the frequency of waterfront
and non-waterfront properties.
# Convert view categories from dummy variables to a factor for better labeling in ggplot
train_df_non_linear$view_category <- factor(apply(train_df_non_linear[, c("view_0", "view_1", "view_2", "view_3", "view_4")], 1, function(x) which(x == 1)),
labels = c("View 0", "View 1", "View 2", "View 3", "View 4"))
# Create the boxplot with ggplot2
ggplot(train_df_non_linear, aes(x = view_category, y = price)) +
geom_boxplot(fill = "brown") +
labs(title = "Price vs. View Quality",
x = "View Quality",
y = "Price",
caption = generate_figure_caption("Boxplot of Price vs. View Quality", section = 5))
The scatter plot above compares price against the
view variable, which represents the quality of the
property’s view. This analysis helps us explore how the view quality
impacts home prices.
# Calculate frequencies of each view quality rating
view_frequencies <- colSums(train_df_non_linear[, c("view_0", "view_1", "view_2", "view_3", "view_4")])
# Convert frequencies to data frame for ggplot2
view_df <- data.frame(View = names(view_frequencies), Frequency = view_frequencies)
# Create the bar plot with ggplot2
ggplot(view_df, aes(x = View, y = Frequency)) +
geom_bar(stat = "identity", fill = "purple") +
labs(title = "Distribution of View Quality",
x = "View Quality",
y = "Frequency",
caption = generate_figure_caption("Distribution of View Quality", section = 5))
The bar plot above displays the distribution of the view
variable, showing the frequency of different view quality ratings.
# Convert condition categories from dummy variables to a factor
train_df_non_linear$condition_category <- factor(apply(train_df_non_linear[, c("condition_1", "condition_2", "condition_3", "condition_4", "condition_5")], 1, function(x) which(x == 1)),
labels = c("Condition 1", "Condition 2", "Condition 3", "Condition 4", "Condition 5"))
# Create the boxplot with ggplot2
ggplot(train_df_non_linear, aes(x = condition_category, y = price)) +
geom_boxplot(fill = "blue") +
labs(title = "Price vs. Condition",
x = "Condition",
y = "Price",
caption = generate_figure_caption("Boxplot of Price vs. Condition", section = 5))
In the scatter plot above, we compare price against the
condition variable, which represents the condition of the
property. This analysis helps us explore how property condition relates
to home prices.
# Calculate frequencies of each condition rating
condition_frequencies <- colSums(train_df_non_linear[, c("condition_1", "condition_2", "condition_3", "condition_4", "condition_5")])
# Convert frequencies to a data frame for ggplot2
condition_df <- data.frame(Condition = names(condition_frequencies), Frequency = condition_frequencies)
# Create the bar plot with ggplot2
ggplot(condition_df, aes(x = Condition, y = Frequency)) +
geom_bar(stat = "identity", fill = "green") +
labs(title = "Distribution of Condition",
x = "Condition Rating",
y = "Frequency",
caption = generate_figure_caption("Distribution of Condition", section = 5))
The bar plot above visualizes the distribution of the
condition variable, showing the frequency of different
condition ratings.
# First, identify all grade-related columns in the dataframe
grade_columns <- grep("grade_", names(train_df_non_linear), value = TRUE)
# Convert dummy variables back to a single categorical variable representing the grade
train_df_non_linear$grade_category <- apply(train_df_non_linear[, grade_columns], 1, function(row) {
if (all(is.na(row))) {
return(NA) # Return NA if all values in the row are NA
} else {
idx <- which(row == 1, arr.ind = TRUE)
return(if(length(idx) > 0) idx else NA) # Return the index of the grade, or NA if none is 1
}
})
# Extract grade labels from column names, replacing underscores with hyphens for better readability
grade_labels <- sub("grade_", "", grade_columns) # Remove 'grade_' prefix
grade_labels <- gsub("_", "-", grade_labels) # Replace underscores with hyphens
# Create a boxplot of Price vs. Grade
ggplot(train_df_non_linear, aes(x = factor(grade_category, labels = grade_labels), y = price)) +
geom_boxplot(fill = "green") +
labs(title = "Price vs. Grade",
x = "Grade",
y = "Price",
caption = generate_figure_caption("Boxplot of Price vs. Grade", section = 5))
The scatter plot above compares price against the
grade variable, which has been aggregated into categories
as per the provided header. This analysis helps us explore how the grade
of construction and design impacts home prices.
# Histogram for the Distribution of Grade
# Convert the grade category to a numeric variable for histogram plotting
train_df_non_linear$grade_category_numeric <- as.numeric(train_df_non_linear$grade_category)
# Define breaks for histogram
num_breaks <- length(unique(train_df_non_linear$grade_category_numeric, na.rm = TRUE))
hist_breaks <- seq(min(train_df_non_linear$grade_category_numeric, na.rm = TRUE) - 0.5,
max(train_df_non_linear$grade_category_numeric, na.rm = TRUE) + 0.5,
length.out = num_breaks + 1)
# Create a histogram with ggplot2
ggplot(train_df_non_linear, aes(x = grade_category_numeric)) +
geom_histogram(fill = "purple", breaks = hist_breaks) +
scale_x_continuous(breaks = seq_along(grade_labels), labels = grade_labels) +
labs(title = "Distribution of Grade",
x = "Grade",
y = "Frequency",
caption = generate_figure_caption("Histogram of Distribution of Grade", section = 5))
The bar plot above displays the distribution of the
grade_category variable, showing the frequency of different
grade categories.
Understanding how continuous variables correlate with each other and,
more importantly, with the target variable price.
# Correlation Matrix of Numeric Variables
cor_matrix <- cor(train_df_non_linear[sapply(train_df_non_linear, is.numeric)])
# Create a table of sorted correlation values
cor_table <- as.data.frame(sort(cor_matrix[,"price"], decreasing = TRUE))
# Display the top 20 correlation values
top_20_corr <- cor_table[1:20, , drop = FALSE]
| Variable | Correlation with Price |
|---|---|
| price | 1.0000000 |
| sqft_living | 0.7020794 |
| sqft_above | 0.5998325 |
| sqft_living15 | 0.5931358 |
| bathrooms | 0.5202595 |
| grade_Above_Average | 0.4714246 |
| sqft_basement | 0.3307377 |
| lat | 0.3122509 |
| bedrooms | 0.3051678 |
| view_4 | 0.2997715 |
| zipcode_98004 | 0.2748744 |
| floors | 0.2569215 |
| waterfront_1 | 0.2455379 |
| zipcode_98040 | 0.2082191 |
| view_3 | 0.1908930 |
| zipcode_98112 | 0.1810434 |
| zipcode_98039 | 0.1711855 |
| view_2 | 0.1472079 |
| zipcode_98006 | 0.1363141 |
| yr_renovated | 0.1276562 |
In the tables presented above, we’ve showcased the top 20 correlation
values concerning the target variable, price, with the
values sorted by their absolute magnitudes. Here are some crucial
observations from this analysis:
sqft_living, sqft_above,
sqft_living15, and bathrooms exhibit robust
positive correlations with the target variable (price).
This implies that as these features increase, house prices tend to
increase correspondingly.grade_11_13, view_4, and
grade_8_10 also demonstrate positive correlations,
indicating that properties with higher grades and better views tend to
command higher prices.sqft_living and grade_11_13 emerge as strong
predictors of price.zipcode_98004,
zipcode_98039, and zipcode_98040, also exhibit
noteworthy positive correlations, underscoring the significance of
location in price determination.# Heatmap of the top 20 correlation values
# Filter the top 20 correlation values
top_20_corr_variables <- rownames(top_20_corr)
top_20_corr_matrix <- cor_matrix[top_20_corr_variables, top_20_corr_variables]
# Create a heatmap
ggplot(melt(top_20_corr_matrix), aes(Var1, Var2, fill = value)) +
geom_tile() +
labs(title = "Top 20 Correlations",
x = "Variable",
y = "Variable",
caption = generate_figure_caption("Heatmap showing the top 20 correlations", section = 5)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Selecting predictors and excluding the response variable 'price'
predictors <- dplyr::select(train_df_linear, -price)
# Convert factors to numeric
numeric_predictors <- predictors %>%
mutate(across(where(is.factor), as.numeric)) %>%
mutate(across(where(is.character), ~as.numeric(as.factor(.))))
# Calculate the correlation matrix
corr_matrix <- cor(numeric_predictors, use = "pairwise.complete.obs")
# Convert the correlation matrix to a long format
correlated_pairs_df <- melt(corr_matrix)
# Filter out redundant pairs (keep only lower triangle of the matrix)
correlated_pairs_df <- correlated_pairs_df %>%
filter(Var1 != Var2) %>% # Remove self-correlations
filter(abs(value) > 0.8) %>%
filter(match(Var1, rownames(corr_matrix)) < match(Var2, rownames(corr_matrix)))
# Rename columns for clarity
correlated_pairs_df <- correlated_pairs_df %>%
rename(Variable1 = Var1, Variable2 = Var2, Correlation = value)
# Output the table of highly correlated pairs using knitr::kable()
knitr_table <- kable(
correlated_pairs_df,
caption = generate_table_caption("Highly Correlated Variable Pairs", section = 5),
format = "markdown"
)
print(knitr_table)
| Variable1 | Variable2 | Correlation |
|---|---|---|
| sqft_living | sqft_above | 0.8744114 |
| month_sold | week_of_year | 0.9955447 |
| month_sold | day_of_year | 0.9958281 |
| week_of_year | day_of_year | 0.9996951 |
| condition_3 | condition_4 | -0.8095157 |
| grade_Above_Average | grade_Average | -0.9954905 |
sqft_above & sqft_livingThe removal of sqft_above and sqft_living
is justified due to their high correlation coefficient of 0.8744114.
sqft_above represents the square footage of the living area
above ground, while sqft_living encompasses the total
square footage of living space. Since sqft_above is a
subset of sqft_living, it is likely to contain redundant
information, making it less valuable for our model.
month_sold & week_of_yearThe variables month_sold and week_of_year
exhibit a remarkably high correlation coefficient of 0.9955447. These
variables are intrinsically correlated as they both pertain to the date
of the house sale. While day_of_year provides the most
detailed temporal information, retaining both week_of_year
and month_sold may lead to multicollinearity issues. It’s
advisable to consider removing one of these variables to mitigate
multicollinearity while preserving the most granular date-related
information.
month_sold & day_of_yearSimilar to the previous case, month_sold and
day_of_year demonstrate a high correlation coefficient of
0.9958281. Both variables are related to the date of the house sale.
Given that day_of_year provides the most granular temporal
information, it may be preferred to retain it while considering the
removal of month_sold to address multicollinearity
concerns.
week_of_year &
day_of_yearThe correlation coefficient of 0.9996951 between
week_of_year and day_of_year indicates an
extremely high correlation. Both variables are associated with the date
of sale. Given the granularity of day_of_year, retaining it
and potentially removing week_of_year can be a strategy to
reduce multicollinearity while retaining essential date-related
information.
condition_4 & condition_3The variables condition_4 and condition_3
display a notable negative correlation coefficient of -0.8095157. These
variables are derived from the categorical variable indicating the
condition of the house. Through one-hot encoding, binary variables were
created for each condition. Since these conditions are mutually
exclusive, they exhibit a negative correlation. Consideration can be
given to keeping one condition as a reference group and discarding the
other, or reverting to using the original categorical variable to
effectively capture overall house condition.
grade_Above_Average &
grade_AverageThe correlation coefficient of -0.9954905 between
grade_Above_Average and grade_Average
highlights a strong negative correlation. These variables represent
different grade categories of houses. Such a high correlation suggests
that retaining both variables may introduce multicollinearity into the
model. Decisions can be made to keep one of these variables as a
representative of house grade or explore alternative encoding
strategies.
By addressing the removal of these highly correlated variable pairs, our primary goal is to mitigate multicollinearity issues. Multicollinearity can distort regression coefficient estimates, inflate standard errors, and potentially obscure the statistical significance of predictors. The objective is to retain variables that provide unique and informative contributions to the model’s prediction of house prices.
Analyzing the influence of time-related features such as
month, and season on house prices.
# Monthly Trends in Average House Prices
monthly_trends <- aggregate(train_df_non_linear$price, by = list(train_df_non_linear$month_sold), FUN = mean)
colnames(monthly_trends) <- c("Month", "Average_Price")
# Find the global maximum
global_max <- monthly_trends[which.max(monthly_trends$Average_Price), ]
ggplot(monthly_trends, aes(x = Month, y = Average_Price)) +
geom_line() +
geom_text(data = global_max, aes(x = Month, y = Average_Price, label = paste("Global Max:", round(Average_Price, 2))), vjust = -0.5) +
labs(title = "Monthly Trends in Average House Prices", x = "Month", y = "Average Price", caption = generate_figure_caption("Monthly Trends in Average House Prices", section = 5)) +
scale_x_continuous(breaks = 1:12, labels = month.name) +
theme_minimal()
# Monthly Trends in Count of Homes Sold
monthly_counts <- table(train_df_non_linear$month_sold)
months <- factor(1:12, labels = month.name)
monthly_counts_df <- data.frame(Month = months, Count = as.numeric(monthly_counts))
# Find the global maximum
global_max_count <- monthly_counts_df[which.max(monthly_counts_df$Count), ]
ggplot(monthly_counts_df, aes(x = Month, y = Count, group = 1)) +
geom_bar(stat = "identity") +
geom_text(data = global_max_count, aes(x = Month, y = Count, label = paste("Global Max:", Count)), vjust = -0.5) + # Add label for global maximum
labs(title = "Monthly Trends in Count of Homes Sold", x = "Month", y = "Count of Homes Sold", caption = generate_figure_caption("Monthly Trends in Count of Homes Sold", section = 5)) +
theme_minimal()
# Aggregate average price for each season
seasonal_trends <- data.frame(
Season = c("Winter", "Spring", "Summer", "Fall"),
Average_Price = c(
mean(train_df_non_linear$price[train_df_non_linear$season_Winter == 1]),
mean(train_df_non_linear$price[train_df_non_linear$season_Spring == 1]),
mean(train_df_non_linear$price[train_df_non_linear$season_Summer == 1]),
mean(train_df_non_linear$price[train_df_non_linear$season_Fall == 1])
)
)
# Find the global maximum
global_max_seasonal <- seasonal_trends[which.max(seasonal_trends$Average_Price), ]
# Plotting
ggplot(seasonal_trends, aes(x = Season, y = Average_Price, fill = Season)) +
geom_bar(stat = "identity") +
geom_text(data = global_max_seasonal, aes(label = paste("Global Max:", round(Average_Price, 2)), y = Average_Price), vjust = -0.5) +
labs(title = "Seasonal Trends in Average House Prices", x = "Season", y = "Average Price", caption = generate_figure_caption("Seasonal Trends in Average House Prices", section = 5)) +
theme_minimal()
# Count homes sold for each season
seasonal_counts <- c(
sum(train_df_non_linear$season_Winter == 1),
sum(train_df_non_linear$season_Spring == 1),
sum(train_df_non_linear$season_Summer == 1),
sum(train_df_non_linear$season_Fall == 1)
)
seasonal_counts_df <- data.frame(Season = c("Winter", "Spring", "Summer", "Fall"), Count = seasonal_counts)
# Find the global maximum
global_max_count_seasonal <- seasonal_counts_df[which.max(seasonal_counts_df$Count), ]
# Plotting
ggplot(seasonal_counts_df, aes(x = Season, y = Count, fill = Season)) +
geom_bar(stat = "identity") +
geom_text(data = global_max_count_seasonal, aes(label = paste("Global Max:", Count), y = Count), vjust = -0.5) +
labs(title = "Seasonal Trends in Count of Homes Sold", x = "Season", y = "Count of Homes Sold", caption = generate_figure_caption("Seasonal Trends in Count of Homes Sold", section = 5)) +
theme_minimal()
# Week of the Year Trends in Average House Prices
weekly_trends <- aggregate(train_df_non_linear$price, by = list(train_df_non_linear$week_of_year), FUN = mean)
colnames(weekly_trends) <- c("Week_of_Year", "Average_Price")
# Find the global maximum
global_max_weekly <- weekly_trends[which.max(weekly_trends$Average_Price), ]
ggplot(weekly_trends, aes(x = Week_of_Year, y = Average_Price)) +
geom_line() +
geom_text(data = global_max_weekly, aes(x = Week_of_Year, y = Average_Price, label = paste("Global Max:", round(Average_Price, 2))), vjust = -0.5) + # Add label for global maximum
labs(title = "Weekly Trends in Average House Prices", x = "Week of Year", y = "Average Price", caption = generate_figure_caption("Weekly Trends in Average House Prices", section = 5)) +
theme_minimal()
# Weekly Trends in Count of Homes Sold
weekly_counts <- table(train_df_non_linear$week_of_year)
weekly_counts_df <- data.frame(Week_of_Year = as.numeric(names(weekly_counts)), Count = as.numeric(weekly_counts))
# Find the global maximum
global_max_count_weekly <- weekly_counts_df[which.max(weekly_counts_df$Count), ]
ggplot(weekly_counts_df, aes(x = Week_of_Year, y = Count)) +
geom_bar(stat = "identity") +
geom_text(data = global_max_count_weekly, aes(x = Week_of_Year, y = Count, label = paste("Global Max:", Count)), vjust = -0.5) + # Add label for global maximum
labs(title = "Weekly Trends in Count of Homes Sold", x = "Week of Year", y = "Count of Homes Sold", caption = generate_figure_caption("Weekly Trends in Count of Homes Sold", section = 5)) +
theme_minimal()
# Day of the Year Trends in Average House Prices
daily_trends <- aggregate(train_df_non_linear$price, by = list(train_df_non_linear$day_of_year), FUN = mean)
colnames(daily_trends) <- c("Day_of_Year", "Average_Price")
# Find the global maximum
global_max_daily <- daily_trends[which.max(daily_trends$Average_Price), ]
ggplot(daily_trends, aes(x = Day_of_Year, y = Average_Price)) +
geom_line() +
geom_text(data = global_max_daily
, aes(x = Day_of_Year, y = Average_Price, label = paste("Global Max:", round(Average_Price, 2))), vjust = -0.5) + # Add label for global maximum
labs(title = "Daily Trends in Average House Prices", x = "Day of Year", y = "Average Price", caption = generate_figure_caption("Daily Trends in Average House Prices", section = 5)) +
theme_minimal()
# Daily Trends in Count of Homes Sold
daily_counts <- table(train_df_non_linear$day_of_year)
daily_counts_df <- data.frame(Day_of_Year = as.numeric(names(daily_counts)), Count = as.numeric(daily_counts))
# Find the global maximum
global_max_count_daily <- daily_counts_df[which.max(daily_counts_df$Count), ]
ggplot(daily_counts_df, aes(x = Day_of_Year, y = Count)) +
geom_bar(stat = "identity") +
geom_text(data = global_max_count_daily, aes(x = Day_of_Year, y = Count, label = paste("Global Max:", Count)), vjust = -0.5) + # Add label for global maximum
labs(title = "Daily Trends in Count of Homes Sold", x = "Day of Year", y = "Count of Homes Sold", caption = generate_figure_caption("Daily Trends in Count of Homes Sold", section = 5)) +
theme_minimal()
Investigating the spatial aspect by analyzing the
distance_to_convergence variable.
# Calculate z-scores for the prices
train_df_non_linear <- train_df_non_linear %>%
mutate(z_score = scale(price))
# Define z-score intervals and corresponding colors
z_score_intervals <- seq(-3, 3, by = 1) # Create a sequence of z-scores from -3 to 3
color_sequence <- c("green", "#8fd744", "#fde725", "#f76818ff", "#d7301fff", "#440154") # From green to dark color
# Calculate price at each z-score interval
price_at_intervals <- sapply(z_score_intervals, function(z) {
mean(train_df_non_linear$price) + z * sd(train_df_non_linear$price)
})
# Ensure breaks are in ascending order and rounded to the nearest 25k
breaks <- sort(round(price_at_intervals / 25000) * 25000)
breaks <- c(min(train_df_non_linear$price, na.rm = TRUE), breaks, max(train_df_non_linear$price, na.rm = TRUE))
# If there are negative values or values that don't make sense, remove them
breaks <- breaks[breaks >= 0]
# Create color palette with a color for each interval
color_palette <- colorBin(color_sequence, domain = train_df_non_linear$price, bins = breaks, na.color = "#808080")
# Initialize the leaflet map with updated color palette
m <- leaflet(train_df_non_linear) %>%
addTiles() %>%
addCircleMarkers(
lat = ~lat, lng = ~long,
color = ~color_palette(price),
fillColor = ~color_palette(price),
fillOpacity = 0.8,
radius = 1, # Small dots
popup = ~paste("Price: $", formatC(price, format = "f", big.mark = ","), "<br>", "Z-Score: ", round(z_score, 2))
)
# Define the maximum distance for the distance bands
max_distance <- max(train_df_non_linear$distance_to_convergence, na.rm = TRUE)
# Add distance bands to the map
for (i in seq(2, max_distance, by = 2)) {
m <- addCircles(m, lat = convergence_point[1], lng = convergence_point[2], radius = i * 1000,
color = "grey", weight = 1, fill = FALSE, dashArray = "5, 5")
}
# Add legend and finalize the map
m <- m %>%
addLegend(
position = "bottomright",
pal = color_palette,
values = ~price,
title = "Price",
labFormat = labelFormat(prefix = "$"),
opacity = 1
) %>%
setView(lng = convergence_point[2], lat = convergence_point[1], zoom = 10)
cat(generate_figure_caption('Distance to Convergence Map', section = 5))
Figure 5.38 Distance to Convergence Map
This detailed review of the King County house sales dataset underscores the thorough preparation undertaken for the predictive analysis. The dataset’s diverse variables, both continuous and categorical, have been meticulously processed and analyzed, providing a robust foundation for developing the predictive model. With the comprehensive EDA and graphical analysis, we gain valuable insights into the correlations and distributions within the data, setting the stage for effective model building and accurate house price prediction.
# Drop columns created for visualizations in prior steps, and columns that have almost perfect correlation and multicollinearity.
train_df_non_linear <- train_df_non_linear[, !colnames(train_df_non_linear) %in% c("view_category", "condition_category", "grade_category", "grade_category_numeric", "z_score", "lat", "long", 'sqft_above', 'month_sold', 'week_of_year', 'condition_3', "grade_Below_Average", "bedrooms_factor", "bathrooms_factor")]
train_df_linear <- train_df_linear[, !colnames(train_df_linear) %in% c("view_category", "condition_category", "grade_category", "grade_category_numeric", "z_score", "lat", "long", 'sqft_above', 'month_sold', 'week_of_year', 'condition_3', "grade_Below_Average", "bedrooms_factor", "bathrooms_factor")]
test_df_linear <- test_df_linear[, !colnames(test_df_linear) %in% c("view_category", "condition_category", "grade_category", "grade_category_numeric", "z_score", "lat", "long", 'sqft_above', 'month_sold', 'week_of_year', 'condition_3', "grade_Below_Average", "bedrooms_factor", "bathrooms_factor")]
test_df_non_linear <- test_df_non_linear[, !colnames(test_df_non_linear) %in% c("view_category", "condition_category", "grade_category", "grade_category_numeric", "z_score", "lat", "long", 'sqft_above', 'month_sold', 'week_of_year', 'condition_3', "grade_Below_Average", "bedrooms_factor", "bathrooms_factor")]
# Rebuild linear regression model before performing stepwise
linear_model_initial <- lm(price ~ ., data = train_df_linear)
# Add new coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = linear_model_initial,
model_name = "OLS w/o corr",
coefficients_df = coefficients_df
)
# Define a function to fetch dropped features from models and create a kable table
get_dropped_features_table <- function(model_backward, model_forward, best_linear_model, train_df_linear) {
# Create a data frame to store the results
results_df <- data.frame(Stepwise_Method = character(), Dropped_Features = character(), stringsAsFactors = FALSE)
# All possible features (excluding the response variable 'price')
all_possible_features <- setdiff(colnames(train_df_linear), "price")
# Function to find the dropped features for a given model
get_dropped_features <- function(model, all_features) {
included_features <- names(coef(model))
dropped_features <- setdiff(all_features, included_features)
return(paste(dropped_features, collapse = ", "))
}
# Get the dropped features for each model
dropped_backward <- get_dropped_features(model_backward, all_possible_features)
dropped_forward <- get_dropped_features(model_forward, all_possible_features)
dropped_both <- get_dropped_features(best_linear_model, all_possible_features)
# Add results to the data frame
results_df <- rbind(results_df, data.frame(Stepwise_Method = "OLS_Step_Backward", Dropped_Features = dropped_backward))
results_df <- rbind(results_df, data.frame(Stepwise_Method = "OLS_Step_Forward", Dropped_Features = dropped_forward))
results_df <- rbind(results_df, data.frame(Stepwise_Method = "OLS_Step_Both", Dropped_Features = dropped_both))
# Create the kable table
table <- kable(
results_df,
format = "html",
caption = generate_table_caption("Dropped Features", section = 6),
col.names = c("Stepwise Method", "Dropped Features")
) %>%
kable_styling(full_width = TRUE)
return(table)
}
# Conditional logic based on the update_model_parameters flag
if (update_model_parameters) {
# Run each of the stepwise regression models and update the JSON file
# Step model both
best_linear_model <- ols_step_both_p(linear_model_initial, pent=0.35, prem=0.05)
features_both <- setdiff(names(coef(best_linear_model$model)), "(Intercept)")
# Create a formula with the selected features
formula <- reformulate(features_forward, response = "price")
# Refit the model with the selected features
best_linear_model <- lm(formula, data = train_df_linear)
# Update JSON
update_model_json("OLS_Step_Both", features_both, json_filepath)
# Step model Backward
model_backward <- ols_step_backward_p(linear_model_initial, pent=0.35, prem = 0.05)
features_backward <- setdiff(names(coef(model_backward$model)), "(Intercept)")
# Create a formula with the selected features
formula <- reformulate(features_forward, response = "price")
# Refit the model with the selected features
model_backward <- lm(formula, data = train_df_linear)
# Update JSON
update_model_json("OLS_Step_Backward", features_backward, json_filepath)
# Step model foward
model_forward <- ols_step_forward_p(linear_model_initial, pent=0.35, penter = 0.05)
features_forward <- setdiff(names(coef(model_forward$model)), "(Intercept)")
# Create a formula with the selected features
formula <- reformulate(features_forward, response = "price")
# Refit the model with the selected features
model_forward <- lm(formula, data = train_df_linear)
# Update JSON
update_model_json("OLS_Step_Forward", features_forward, json_filepath)
# Load model parameters from JSON and build models
model_params <- fromJSON(json_filepath)
# List of features for each model from model_params
features_both <- model_params$OLS_Step_Both
features_backward <- model_params$OLS_Step_Backward
features_forward <- model_params$OLS_Step_Forward
# Create a temporary dataframe for each model with the selected features
train_df_both <- train_df_linear[, c("price", features_both)]
test_df_both <- test_df_linear[, c("price", features_both)]
train_df_backward <- train_df_linear[, c("price", features_backward)]
test_df_backward <- test_df_linear[, c("price", features_backward)]
train_df_forward <- train_df_linear[, c("price", features_forward)]
test_df_forward <- test_df_linear[, c("price", features_forward)]
# Fit the linear models using the temporary dataframes
best_linear_model <- lm(price ~ ., data = train_df_both)
model_backward <- lm(price ~ ., data = train_df_backward)
model_forward <- lm(price ~ ., data = train_df_forward)
} else {
# Load model parameters from JSON and build models
model_params <- fromJSON(json_filepath)
# Create models based on the loaded features
if (all(c("OLS_Step_Both", "OLS_Step_Backward", "OLS_Step_Forward") %in% names(model_params))) {
# For each model type, create the model using the features stored in the JSON
# List of features for each model from model_params
features_both <- model_params$OLS_Step_Both
features_backward <- model_params$OLS_Step_Backward
features_forward <- model_params$OLS_Step_Forward
# Create a temporary dataframe for each model with the selected features
train_df_both <- train_df_linear[, c("price", features_both)]
test_df_both <- test_df_linear[, c("price", features_both)]
train_df_backward <- train_df_linear[, c("price", features_backward)]
test_df_backward <- test_df_linear[, c("price", features_backward)]
train_df_forward <- train_df_linear[, c("price", features_forward)]
test_df_forward <- test_df_linear[, c("price", features_forward)]
# Fit the linear models using the temporary dataframes
best_linear_model <- lm(price ~ ., data = train_df_both)
model_backward <- lm(price ~ ., data = train_df_backward)
model_forward <- lm(price ~ ., data = train_df_forward)
} else {
stop("Required model parameters are missing in the JSON file.")
}
}
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = best_linear_model,
model_name = "Inital Step Both",
coefficients_df = coefficients_df
)
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = model_forward,
model_name = "Step Forward",
coefficients_df = coefficients_df
)
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = model_backward,
model_name = "Step Backward",
coefficients_df = coefficients_df
)
# Evaluate OLS_Step_Both
df_results <- evaluate_model("OLS_Step_Both", best_linear_model, train_df_both, test_df_both, target_var = 'price', df_results)
# Evaluate OLS_Step_Backward
df_results <- evaluate_model("OLS_Step_Backward", model_backward, train_df_backward, test_df_backward, target_var = 'price', df_results)
# Evaluate OLS_Step_Forward
df_results <- evaluate_model("OLS_Step_Forward", model_forward, train_df_forward, test_df_forward, target_var = 'price', df_results)
# featch all the dropped features from each of the models
dropped_features_table <- get_dropped_features_table(model_backward, model_forward, best_linear_model, train_df_linear)
| Stepwise Method | Dropped Features |
|---|---|
| OLS_Step_Backward | yr_built, sqft_lot15, condition_2, grade_Average, season_Summer |
| OLS_Step_Forward | sqft_lot15, day_sold, condition_2, season_Fall |
| OLS_Step_Both | yr_built, sqft_lot15, day_sold, zipcode_98010, zipcode_98022, zipcode_98029, zipcode_98033, zipcode_98045, zipcode_98115, zipcode_98116, zipcode_98122, zipcode_98136, zipcode_98177, condition_2, grade_Above_Average, season_Fall |
# Display model results
view_model_results(df_results, generate_table_caption("Step Model Additions", section = 6))
summary(best_linear_model)
##
## Call:
## lm(formula = price ~ ., data = train_df_both)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1416431 -64248 1685 59901 4234646
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.013e+08 1.030e+07 -9.832 < 2e-16 ***
## bathrooms 2.292e+04 2.798e+03 8.193 2.74e-16 ***
## sqft_living 2.115e+02 3.283e+00 64.427 < 2e-16 ***
## sqft_basement -7.342e+01 4.120e+00 -17.822 < 2e-16 ***
## sqft_living15 3.255e+01 3.246e+00 10.026 < 2e-16 ***
## grade_Average -3.520e+05 9.697e+03 -36.298 < 2e-16 ***
## distance_to_convergence -9.300e+03 2.496e+02 -37.260 < 2e-16 ***
## waterfront_0 -5.190e+05 1.905e+04 -27.239 < 2e-16 ***
## zipcode_98004 3.622e+05 1.176e+04 30.789 < 2e-16 ***
## zipcode_98112 3.059e+05 1.232e+04 24.828 < 2e-16 ***
## zipcode_98039 6.904e+05 2.738e+04 25.214 < 2e-16 ***
## view_4 3.414e+05 1.341e+04 25.462 < 2e-16 ***
## bedrooms -2.159e+04 1.730e+03 -12.480 < 2e-16 ***
## zipcode_98040 1.517e+05 1.221e+04 12.427 < 2e-16 ***
## view_3 1.742e+05 8.979e+03 19.405 < 2e-16 ***
## zipcode_98119 1.919e+05 1.357e+04 14.147 < 2e-16 ***
## zipcode_98105 1.466e+05 1.306e+04 11.227 < 2e-16 ***
## zipcode_98109 1.983e+05 1.793e+04 11.059 < 2e-16 ***
## zipcode_98178 -2.705e+05 1.218e+04 -22.209 < 2e-16 ***
## zipcode_98102 2.024e+05 1.739e+04 11.635 < 2e-16 ***
## zipcode_98117 3.302e+04 8.681e+03 3.804 0.000143 ***
## zipcode_98103 2.928e+04 8.533e+03 3.432 0.000602 ***
## view_2 7.160e+04 6.400e+03 11.187 < 2e-16 ***
## zipcode_98107 4.996e+04 1.195e+04 4.180 2.93e-05 ***
## zipcode_98059 -2.278e+05 9.289e+03 -24.523 < 2e-16 ***
## year_sold 5.087e+04 5.113e+03 9.950 < 2e-16 ***
## condition_5 7.211e+04 4.940e+03 14.595 < 2e-16 ***
## zipcode_98056 -2.288e+05 9.768e+03 -23.426 < 2e-16 ***
## zipcode_98058 -2.232e+05 9.472e+03 -23.563 < 2e-16 ***
## zipcode_98168 -2.283e+05 1.208e+04 -18.903 < 2e-16 ***
## zipcode_98055 -2.272e+05 1.174e+04 -19.354 < 2e-16 ***
## zipcode_98118 -1.782e+05 9.168e+03 -19.437 < 2e-16 ***
## zipcode_98108 -2.072e+05 1.369e+04 -15.135 < 2e-16 ***
## sqft_lot 2.496e-01 3.583e-02 6.968 3.36e-12 ***
## zipcode_98106 -1.759e+05 1.054e+04 -16.688 < 2e-16 ***
## zipcode_98146 -1.885e+05 1.174e+04 -16.060 < 2e-16 ***
## yr_renovated 3.162e+01 3.213e+00 9.842 < 2e-16 ***
## zipcode_98031 -2.069e+05 1.178e+04 -17.568 < 2e-16 ***
## zipcode_98188 -2.298e+05 1.676e+04 -13.710 < 2e-16 ***
## condition_4 2.516e+04 3.164e+03 7.952 1.96e-15 ***
## view_1 7.131e+04 1.036e+04 6.881 6.18e-12 ***
## zipcode_98166 -1.885e+05 1.187e+04 -15.883 < 2e-16 ***
## zipcode_98198 -1.984e+05 1.223e+04 -16.224 < 2e-16 ***
## zipcode_98028 -1.686e+05 1.199e+04 -14.057 < 2e-16 ***
## floors -3.155e+04 3.434e+03 -9.189 < 2e-16 ***
## zipcode_98042 -1.703e+05 9.138e+03 -18.633 < 2e-16 ***
## zipcode_98030 -1.787e+05 1.241e+04 -14.404 < 2e-16 ***
## zipcode_98011 -1.672e+05 1.400e+04 -11.942 < 2e-16 ***
## zipcode_98070 -1.860e+05 1.801e+04 -10.323 < 2e-16 ***
## zipcode_98032 -1.829e+05 1.741e+04 -10.508 < 2e-16 ***
## zipcode_98155 -1.328e+05 9.443e+03 -14.063 < 2e-16 ***
## zipcode_98034 -1.205e+05 8.865e+03 -13.591 < 2e-16 ***
## season_Spring 2.006e+04 3.411e+03 5.880 4.18e-09 ***
## day_of_year 1.131e+02 2.457e+01 4.602 4.22e-06 ***
## zipcode_98077 -1.495e+05 1.432e+04 -10.440 < 2e-16 ***
## zipcode_98148 -1.798e+05 2.276e+04 -7.903 2.91e-15 ***
## season_Summer 1.058e+04 3.606e+03 2.933 0.003360 **
## zipcode_98125 -1.101e+05 9.904e+03 -11.113 < 2e-16 ***
## zipcode_98072 -1.236e+05 1.204e+04 -10.266 < 2e-16 ***
## zipcode_98133 -1.086e+05 9.027e+03 -12.027 < 2e-16 ***
## zipcode_98074 -1.079e+05 9.617e+03 -11.215 < 2e-16 ***
## zipcode_98019 -1.343e+05 1.377e+04 -9.757 < 2e-16 ***
## zipcode_98092 -1.431e+05 1.121e+04 -12.770 < 2e-16 ***
## zipcode_98038 -1.273e+05 8.995e+03 -14.157 < 2e-16 ***
## zipcode_98023 -1.284e+05 1.023e+04 -12.550 < 2e-16 ***
## zipcode_98001 -1.229e+05 1.097e+04 -11.204 < 2e-16 ***
## zipcode_98003 -1.222e+05 1.254e+04 -9.749 < 2e-16 ***
## zipcode_98002 -1.138e+05 1.401e+04 -8.124 4.88e-16 ***
## zipcode_98065 -9.718e+04 1.142e+04 -8.511 < 2e-16 ***
## zipcode_98126 -9.718e+04 1.065e+04 -9.123 < 2e-16 ***
## zipcode_98027 -9.378e+04 9.795e+03 -9.574 < 2e-16 ***
## zipcode_98075 -9.893e+04 1.064e+04 -9.294 < 2e-16 ***
## zipcode_98053 -8.840e+04 1.030e+04 -8.584 < 2e-16 ***
## zipcode_98052 -8.713e+04 8.528e+03 -10.217 < 2e-16 ***
## zipcode_98008 -9.692e+04 1.194e+04 -8.120 5.03e-16 ***
## zipcode_98006 -7.846e+04 9.535e+03 -8.228 < 2e-16 ***
## zipcode_98144 -8.192e+04 1.072e+04 -7.642 2.27e-14 ***
## zipcode_98007 -1.053e+05 1.632e+04 -6.453 1.13e-10 ***
## zipcode_98005 -7.554e+04 1.489e+04 -5.073 3.96e-07 ***
## zipcode_98014 -7.135e+04 1.788e+04 -3.992 6.59e-05 ***
## zipcode_98024 -7.353e+04 2.154e+04 -3.414 0.000642 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 155500 on 15048 degrees of freedom
## Multiple R-squared: 0.8161, Adjusted R-squared: 0.8152
## F-statistic: 835 on 80 and 15048 DF, p-value: < 2.2e-16
In our analysis of different regression models, we evaluated four models: OLS_linear, OLS_Step_Both, OLS_Step_Backward, and OLS_Step_Forward. These models were assessed based on several key metrics, including the sum of squared errors (SSE) for both the training and testing datasets, the coefficient of determination (R-squared) for both training and testing, root mean square error (RMSE) for training and testing, and mean absolute error (MAE) for training and testing.
The OLS_Step_Both model emerged as a compelling choice due to its unique characteristics. While it exhibited a slight reduction in model performance compared to the OLS_linear model, it showcased a distinctive feature selection process. OLS_Step_Both effectively removes a substantial number of features, making it the least complex model among the four alternatives. This feature reduction enhances model interpretability and simplicity, which can be particularly valuable in scenarios where we seek to understand the most influential variables while maintaining competitive predictive power.
The trade-off between complexity and performance makes OLS_Step_Both an attractive option for specific use cases. If the primary goal is to build a model that strikes a balance between simplicity and accuracy, the OLS_Step_Both model offers a pragmatic solution. However, after comprehensive consideration, we ultimately select the OLS_linear model as the preferred choice for our regression analysis. It delivers strong overall performance across various metrics, making it a robust and versatile model for our dataset and problem.
The residuals vs fitted plot is our first stop to assess the assumption of linearity. The ideal scenario is a random spread of residuals around the horizontal axis, indicating a linear relationship between predictors and the response.
Figure 7.1 Residuals vs Fitted Plot for Linear
Model
We use the ols_plot_resid_lev function to create a plot
for detecting outliers and observations with high leverage.
Figure 7.2 OLS Outlier and Leverage Diagnostics Linear
Model
We use the ols_plot_resid_stud_fit function to create a
plot that helps detect non-linearity, constant variances, and outliers
in residuals.
Figure 7.3 Deleted Studentized Resid. vs Pred. Linear
Model
The Q-Q plot offers a visual comparison of the distribution of residuals against a perfectly normal distribution. Deviations from the diagonal indicate departures from normality.
Figure 7.4 QQ Plot for Linear
Model
Also known as the spread-location plot, it’s used to check for equal variance of residuals (homoscedasticity).
Figure 7.5 Scale-Location Plot for Linear
Model
The Cook’s distance plot is instrumental in quantifying the influence of each data point.
Figure 7.6 Cooks Distance Plot for Linear
Model
Figure 7.7 OLS Cooks Distance Plot for Linear
Model
We filter the data points with Cook’s distance >= 0.04, which is a reasonable threshold for identifying influential data points according to industry standards.
# Define the threshold for Cook's distance, opting for slightly lower than .04 as it benefits the model performance to do a little bit lower
threshold <- 0.04
# Identify the indices of influential observations
influential_obs <- which(cooksd > threshold)
# Output what observations were influential prior to removal
print(influential_obs)
## 5127 9986 13244
## 5127 9986 13244
# Remove outliers from both datasets
train_df_both <- train_df_both[-influential_obs, ]
# re-fit the linear model without outliers
best_linear_model <- lm(price ~ ., data = train_df_both)
summary(best_linear_model)
##
## Call:
## lm(formula = price ~ ., data = train_df_both)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1359328 -64137 589 58324 2106190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.686e+07 9.896e+06 -9.787 < 2e-16 ***
## bathrooms 2.180e+04 2.687e+03 8.112 5.36e-16 ***
## sqft_living 2.013e+02 3.167e+00 63.583 < 2e-16 ***
## sqft_basement -7.199e+01 3.957e+00 -18.194 < 2e-16 ***
## sqft_living15 3.946e+01 3.123e+00 12.633 < 2e-16 ***
## grade_Average -3.391e+05 9.321e+03 -36.386 < 2e-16 ***
## distance_to_convergence -9.304e+03 2.397e+02 -38.821 < 2e-16 ***
## waterfront_0 -5.162e+05 1.837e+04 -28.104 < 2e-16 ***
## zipcode_98004 3.563e+05 1.131e+04 31.490 < 2e-16 ***
## zipcode_98112 3.073e+05 1.183e+04 25.980 < 2e-16 ***
## zipcode_98039 6.988e+05 2.629e+04 26.578 < 2e-16 ***
## view_4 3.298e+05 1.291e+04 25.541 < 2e-16 ***
## bedrooms -1.939e+04 1.663e+03 -11.662 < 2e-16 ***
## zipcode_98040 1.455e+05 1.174e+04 12.386 < 2e-16 ***
## view_3 1.666e+05 8.630e+03 19.303 < 2e-16 ***
## zipcode_98119 1.916e+05 1.303e+04 14.705 < 2e-16 ***
## zipcode_98105 1.460e+05 1.254e+04 11.640 < 2e-16 ***
## zipcode_98109 1.978e+05 1.722e+04 11.487 < 2e-16 ***
## zipcode_98178 -2.717e+05 1.170e+04 -23.230 < 2e-16 ***
## zipcode_98102 1.514e+05 1.679e+04 9.014 < 2e-16 ***
## zipcode_98117 3.255e+04 8.336e+03 3.905 9.48e-05 ***
## zipcode_98103 2.717e+04 8.193e+03 3.316 0.000914 ***
## view_2 7.466e+04 6.146e+03 12.147 < 2e-16 ***
## zipcode_98107 4.726e+04 1.148e+04 4.118 3.84e-05 ***
## zipcode_98059 -2.278e+05 8.919e+03 -25.543 < 2e-16 ***
## year_sold 4.866e+04 4.911e+03 9.908 < 2e-16 ***
## condition_5 7.279e+04 4.744e+03 15.343 < 2e-16 ***
## zipcode_98056 -2.300e+05 9.379e+03 -24.517 < 2e-16 ***
## zipcode_98058 -2.238e+05 9.095e+03 -24.606 < 2e-16 ***
## zipcode_98168 -2.305e+05 1.160e+04 -19.879 < 2e-16 ***
## zipcode_98055 -2.289e+05 1.127e+04 -20.308 < 2e-16 ***
## zipcode_98118 -1.793e+05 8.803e+03 -20.367 < 2e-16 ***
## zipcode_98108 -2.086e+05 1.315e+04 -15.867 < 2e-16 ***
## sqft_lot 2.718e-01 3.441e-02 7.901 2.96e-15 ***
## zipcode_98106 -1.778e+05 1.012e+04 -17.566 < 2e-16 ***
## zipcode_98146 -1.886e+05 1.127e+04 -16.735 < 2e-16 ***
## yr_renovated 2.986e+01 3.087e+00 9.675 < 2e-16 ***
## zipcode_98031 -2.081e+05 1.131e+04 -18.400 < 2e-16 ***
## zipcode_98188 -2.304e+05 1.609e+04 -14.316 < 2e-16 ***
## condition_4 2.317e+04 3.039e+03 7.624 2.60e-14 ***
## view_1 7.444e+04 9.951e+03 7.481 7.81e-14 ***
## zipcode_98166 -1.863e+05 1.140e+04 -16.345 < 2e-16 ***
## zipcode_98198 -1.980e+05 1.174e+04 -16.866 < 2e-16 ***
## zipcode_98028 -1.690e+05 1.151e+04 -14.682 < 2e-16 ***
## floors -2.844e+04 3.299e+03 -8.623 < 2e-16 ***
## zipcode_98042 -1.714e+05 8.775e+03 -19.531 < 2e-16 ***
## zipcode_98030 -1.803e+05 1.191e+04 -15.131 < 2e-16 ***
## zipcode_98011 -1.681e+05 1.345e+04 -12.501 < 2e-16 ***
## zipcode_98070 -1.843e+05 1.730e+04 -10.654 < 2e-16 ***
## zipcode_98032 -1.834e+05 1.672e+04 -10.969 < 2e-16 ***
## zipcode_98155 -1.336e+05 9.067e+03 -14.731 < 2e-16 ***
## zipcode_98034 -1.210e+05 8.513e+03 -14.211 < 2e-16 ***
## season_Spring 1.924e+04 3.276e+03 5.872 4.40e-09 ***
## day_of_year 1.042e+02 2.359e+01 4.418 1.00e-05 ***
## zipcode_98077 -1.476e+05 1.375e+04 -10.737 < 2e-16 ***
## zipcode_98148 -1.814e+05 2.185e+04 -8.304 < 2e-16 ***
## season_Summer 1.060e+04 3.463e+03 3.061 0.002207 **
## zipcode_98125 -1.111e+05 9.510e+03 -11.683 < 2e-16 ***
## zipcode_98072 -1.233e+05 1.156e+04 -10.665 < 2e-16 ***
## zipcode_98133 -1.105e+05 8.668e+03 -12.745 < 2e-16 ***
## zipcode_98074 -1.070e+05 9.235e+03 -11.585 < 2e-16 ***
## zipcode_98019 -1.357e+05 1.322e+04 -10.269 < 2e-16 ***
## zipcode_98092 -1.437e+05 1.076e+04 -13.354 < 2e-16 ***
## zipcode_98038 -1.287e+05 8.637e+03 -14.902 < 2e-16 ***
## zipcode_98023 -1.288e+05 9.827e+03 -13.105 < 2e-16 ***
## zipcode_98001 -1.242e+05 1.053e+04 -11.790 < 2e-16 ***
## zipcode_98003 -1.228e+05 1.204e+04 -10.203 < 2e-16 ***
## zipcode_98002 -1.143e+05 1.346e+04 -8.495 < 2e-16 ***
## zipcode_98065 -9.733e+04 1.096e+04 -8.877 < 2e-16 ***
## zipcode_98126 -9.825e+04 1.023e+04 -9.606 < 2e-16 ***
## zipcode_98027 -9.325e+04 9.405e+03 -9.915 < 2e-16 ***
## zipcode_98075 -9.728e+04 1.022e+04 -9.517 < 2e-16 ***
## zipcode_98053 -8.705e+04 9.889e+03 -8.803 < 2e-16 ***
## zipcode_98052 -8.725e+04 8.189e+03 -10.654 < 2e-16 ***
## zipcode_98008 -9.622e+04 1.146e+04 -8.394 < 2e-16 ***
## zipcode_98006 -7.440e+04 9.156e+03 -8.126 4.80e-16 ***
## zipcode_98144 -8.200e+04 1.029e+04 -7.966 1.75e-15 ***
## zipcode_98007 -1.060e+05 1.567e+04 -6.767 1.36e-11 ***
## zipcode_98005 -7.296e+04 1.430e+04 -5.103 3.38e-07 ***
## zipcode_98014 -7.122e+04 1.716e+04 -4.149 3.36e-05 ***
## zipcode_98024 -7.325e+04 2.068e+04 -3.542 0.000399 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 149300 on 15045 degrees of freedom
## Multiple R-squared: 0.8218, Adjusted R-squared: 0.8208
## F-statistic: 867.1 on 80 and 15045 DF, p-value: < 2.2e-16
df_results <- evaluate_model("OLS_Step_Both_Outliers", best_linear_model, train_df_both, test_df_both, target_var = 'price', df_results)
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = best_linear_model,
model_name = "OLS Both outliers",
coefficients_df = coefficients_df
)
Figure 8.1 Box-Cox Plot for Linear
Model
As we can observe from the box cox plot above, the lambda that give the highest log-likehood is 0.0909091. We transform Y using the log transformation as shown below.
# Function to transform a single value back to the original scale
inverse_transform <- function(value, lambda=optimal_lambda) {
if (lambda != 0) {
return((value * lambda + 1)^(1 / lambda))
} else {
return(exp(value))
}
}
# Function to calculate SSE, RMSE, MAE, and SST with or without inverse transformation
calculate_metrics <- function(actual, predicted, lambda = optimal_lambda) {
if (lambda != 0) {
# Apply inverse transformation to actual and predicted values
actual_original <- inverse_transform(actual, lambda)
predicted_original <- inverse_transform(predicted, lambda)
# Calculate SSE, RMSE, MAE, and SST
sse <- sum((actual_original - predicted_original)^2)
rmse <- sqrt(sse / length(actual_original))
mae <- mean(abs(actual_original - predicted_original))
sst <- sum((actual_original - mean(actual_original))^2)
} else {
# Calculate SSE, RMSE, MAE, and SST without transformation
sse <- sum((actual - predicted)^2)
rmse <- sqrt(sse / length(actual))
mae <- mean(abs(actual - predicted))
sst <- sum((actual - mean(actual))^2)
}
return(list(sse = sse, rmse = rmse, mae = mae, sst = sst))
}
# Apply the Box-Cox transformation to the training and test data
if (optimal_lambda != 0) {
transformed_price <- (train_df_both$price^optimal_lambda - 1) / optimal_lambda
transformed_test_price <- (test_df_both$price^optimal_lambda - 1) / optimal_lambda
} else {
transformed_price <- log(train_df_both$price)
transformed_test_price <- log(test_df_both$price)
}
# Create a linear regression model using the transformed price
transformed_model <- lm(transformed_price ~ ., data = dplyr::select(train_df_both, -price))
summary(transformed_model)
##
## Call:
## lm(formula = transformed_price ~ ., data = dplyr::select(train_df_both,
## -price))
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4702 -0.3274 0.0294 0.3550 3.3390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.602e+02 4.112e+01 -13.625 < 2e-16 ***
## bathrooms 1.680e-01 1.116e-02 15.050 < 2e-16 ***
## sqft_living 8.984e-04 1.316e-05 68.287 < 2e-16 ***
## sqft_basement -3.453e-04 1.644e-05 -21.001 < 2e-16 ***
## sqft_living15 3.761e-04 1.298e-05 28.982 < 2e-16 ***
## grade_Average -9.368e-02 3.873e-02 -2.419 0.0156 *
## distance_to_convergence -5.969e-02 9.958e-04 -59.947 < 2e-16 ***
## waterfront_0 -1.508e+00 7.631e-02 -19.763 < 2e-16 ***
## zipcode_98004 7.198e-01 4.701e-02 15.313 < 2e-16 ***
## zipcode_98112 7.708e-01 4.915e-02 15.683 < 2e-16 ***
## zipcode_98039 1.247e+00 1.092e-01 11.417 < 2e-16 ***
## view_4 1.024e+00 5.365e-02 19.096 < 2e-16 ***
## bedrooms -3.523e-02 6.909e-03 -5.099 3.45e-07 ***
## zipcode_98040 1.054e-01 4.880e-02 2.160 0.0308 *
## view_3 6.726e-01 3.585e-02 18.760 < 2e-16 ***
## zipcode_98119 8.285e-01 5.413e-02 15.306 < 2e-16 ***
## zipcode_98105 4.431e-01 5.210e-02 8.505 < 2e-16 ***
## zipcode_98109 7.553e-01 7.153e-02 10.560 < 2e-16 ***
## zipcode_98178 -2.002e+00 4.859e-02 -41.209 < 2e-16 ***
## zipcode_98102 5.634e-01 6.977e-02 8.075 7.24e-16 ***
## zipcode_98117 2.955e-01 3.463e-02 8.532 < 2e-16 ***
## zipcode_98103 1.709e-01 3.404e-02 5.021 5.21e-07 ***
## view_2 3.922e-01 2.554e-02 15.358 < 2e-16 ***
## zipcode_98107 3.184e-01 4.768e-02 6.678 2.51e-11 ***
## zipcode_98059 -1.286e+00 3.706e-02 -34.708 < 2e-16 ***
## year_sold 2.907e-01 2.040e-02 14.247 < 2e-16 ***
## condition_5 3.668e-01 1.971e-02 18.611 < 2e-16 ***
## zipcode_98056 -1.489e+00 3.897e-02 -38.218 < 2e-16 ***
## zipcode_98058 -1.493e+00 3.779e-02 -39.498 < 2e-16 ***
## zipcode_98168 -2.042e+00 4.818e-02 -42.384 < 2e-16 ***
## zipcode_98055 -1.747e+00 4.683e-02 -37.295 < 2e-16 ***
## zipcode_98118 -1.245e+00 3.657e-02 -34.049 < 2e-16 ***
## zipcode_98108 -1.427e+00 5.462e-02 -26.123 < 2e-16 ***
## sqft_lot 2.376e-06 1.430e-07 16.621 < 2e-16 ***
## zipcode_98106 -1.422e+00 4.204e-02 -33.820 < 2e-16 ***
## zipcode_98146 -1.416e+00 4.682e-02 -30.245 < 2e-16 ***
## yr_renovated 1.358e-04 1.283e-05 10.586 < 2e-16 ***
## zipcode_98031 -1.540e+00 4.698e-02 -32.783 < 2e-16 ***
## zipcode_98188 -1.789e+00 6.687e-02 -26.752 < 2e-16 ***
## condition_4 1.544e-01 1.263e-02 12.230 < 2e-16 ***
## view_1 3.840e-01 4.134e-02 9.287 < 2e-16 ***
## zipcode_98166 -1.037e+00 4.736e-02 -21.891 < 2e-16 ***
## zipcode_98198 -1.446e+00 4.878e-02 -29.642 < 2e-16 ***
## zipcode_98028 -9.154e-01 4.784e-02 -19.135 < 2e-16 ***
## floors -6.739e-02 1.371e-02 -4.917 8.89e-07 ***
## zipcode_98042 -1.298e+00 3.646e-02 -35.605 < 2e-16 ***
## zipcode_98030 -1.407e+00 4.950e-02 -28.423 < 2e-16 ***
## zipcode_98011 -8.618e-01 5.586e-02 -15.427 < 2e-16 ***
## zipcode_98070 -5.711e-01 7.188e-02 -7.945 2.08e-15 ***
## zipcode_98032 -1.569e+00 6.945e-02 -22.584 < 2e-16 ***
## zipcode_98155 -8.571e-01 3.767e-02 -22.752 < 2e-16 ***
## zipcode_98034 -7.224e-01 3.537e-02 -20.425 < 2e-16 ***
## season_Spring 1.247e-01 1.361e-02 9.162 < 2e-16 ***
## day_of_year 6.417e-04 9.803e-05 6.545 6.13e-11 ***
## zipcode_98077 -5.770e-01 5.711e-02 -10.103 < 2e-16 ***
## zipcode_98148 -1.516e+00 9.079e-02 -16.699 < 2e-16 ***
## season_Summer 8.471e-02 1.439e-02 5.887 4.01e-09 ***
## zipcode_98125 -6.053e-01 3.951e-02 -15.320 < 2e-16 ***
## zipcode_98072 -5.819e-01 4.803e-02 -12.117 < 2e-16 ***
## zipcode_98133 -7.318e-01 3.601e-02 -20.320 < 2e-16 ***
## zipcode_98074 -4.657e-01 3.837e-02 -12.138 < 2e-16 ***
## zipcode_98019 -7.618e-01 5.492e-02 -13.872 < 2e-16 ***
## zipcode_98092 -1.011e+00 4.471e-02 -22.616 < 2e-16 ***
## zipcode_98038 -8.612e-01 3.589e-02 -23.997 < 2e-16 ***
## zipcode_98023 -1.088e+00 4.083e-02 -26.660 < 2e-16 ***
## zipcode_98001 -1.144e+00 4.377e-02 -26.129 < 2e-16 ***
## zipcode_98003 -1.021e+00 5.003e-02 -20.403 < 2e-16 ***
## zipcode_98002 -1.293e+00 5.591e-02 -23.134 < 2e-16 ***
## zipcode_98065 -3.761e-01 4.555e-02 -8.256 < 2e-16 ***
## zipcode_98126 -6.307e-01 4.250e-02 -14.841 < 2e-16 ***
## zipcode_98027 -4.180e-01 3.908e-02 -10.698 < 2e-16 ***
## zipcode_98075 -4.456e-01 4.247e-02 -10.494 < 2e-16 ***
## zipcode_98053 -3.637e-01 4.109e-02 -8.852 < 2e-16 ***
## zipcode_98052 -3.983e-01 3.402e-02 -11.707 < 2e-16 ***
## zipcode_98008 -5.176e-01 4.762e-02 -10.869 < 2e-16 ***
## zipcode_98006 -5.343e-01 3.804e-02 -14.044 < 2e-16 ***
## zipcode_98144 -6.792e-01 4.277e-02 -15.881 < 2e-16 ***
## zipcode_98007 -5.803e-01 6.511e-02 -8.913 < 2e-16 ***
## zipcode_98005 -4.024e-01 5.940e-02 -6.774 1.30e-11 ***
## zipcode_98014 -4.854e-01 7.132e-02 -6.806 1.04e-11 ***
## zipcode_98024 -4.948e-01 8.593e-02 -5.759 8.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6204 on 15045 degrees of freedom
## Multiple R-squared: 0.8726, Adjusted R-squared: 0.8719
## F-statistic: 1288 on 80 and 15045 DF, p-value: < 2.2e-16
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = transformed_model,
model_name = "Transformed Model",
coefficients_df = coefficients_df
)
# Predict prices for training and test data using the transformed model
predicted_train_transformed <- predict(transformed_model, newdata = train_df_both)
predicted_test_transformed <- predict(transformed_model, newdata = test_df_both)
# Calculate other metrics for training and test data
metrics_train <- calculate_metrics(transformed_price, predicted_train_transformed, optimal_lambda)
metrics_test <- calculate_metrics(transformed_test_price, predicted_test_transformed, optimal_lambda)
# Calculate SSE for the training and test data without inverse transformation
sse_train <- sum((transformed_price - predicted_train_transformed)^2)
sse_test <- sum((transformed_test_price - predicted_test_transformed)^2)
# Calculate R-squared for training and test data without inverse transformation
sst_train <- sum((transformed_price - mean(transformed_price))^2)
sst_test <- sum((transformed_test_price - mean(transformed_test_price))^2)
r_squared_train <- 1 - (sse_train / sst_train)
r_squared_test <- 1 - (sse_test / sst_test)
# Create a dataframe to store the results
df_results <- rbind(df_results, data.frame(
Model = "Transformed Model",
SSE_train = metrics_train$sse,
SSE_test = metrics_test$sse,
R_squared_train = r_squared_train,
R_squared_test = r_squared_test,
RMSE_train = metrics_train$rmse,
RMSE_test = metrics_test$rmse,
MAE_train = metrics_train$mae,
MAE_test = metrics_test$mae
))
# View the updated model results
view_model_results(df_results, caption = generate_table_caption("Transformation Model Addition", section = 8))
Figure 8.2 Diagnostic Plots after
Transformation
Figure 8.3 Boxplot After
Transformation
### 8.3 Detection of Multicollinearity
In this section, we perform a Variance Inflation Factor (VIF)-based multicollinearity analysis. Multicollinearity refers to the situation where predictor variables in a regression model are highly correlated with each other, which can lead to instability in coefficient estimates and difficulties in interpreting the model. VIF helps us identify and mitigate multicollinearity by quantifying how much the variance of the estimated coefficients is increased due to the correlation between predictors.
vif_values <- vif(transformed_model)
vif_values
## bathrooms sqft_living sqft_basement
## 2.885919 5.591630 2.078644
## sqft_living15 grade_Average distance_to_convergence
## 3.129896 1.287715 3.586258
## waterfront_0 zipcode_98004 zipcode_98112
## 1.532997 1.239202 1.140794
## zipcode_98039 view_4 bedrooms
## 1.051926 1.555985 1.654887
## zipcode_98040 view_3 zipcode_98119
## 1.178869 1.116631 1.070897
## zipcode_98105 zipcode_98109 zipcode_98178
## 1.095893 1.044862 1.085291
## zipcode_98102 zipcode_98117 zipcode_98103
## 1.056638 1.157612 1.218270
## view_2 zipcode_98107 zipcode_98059
## 1.097466 1.096697 1.158714
## year_sold condition_5 zipcode_98056
## 3.579772 1.129758 1.133790
## zipcode_98058 zipcode_98168 zipcode_98055
## 1.133719 1.084810 1.074864
## zipcode_98118 zipcode_98108 sqft_lot
## 1.188458 1.075240 1.223590
## zipcode_98106 zipcode_98146 yr_renovated
## 1.116055 1.085421 1.049119
## zipcode_98031 zipcode_98188 condition_4
## 1.104029 1.039494 1.214259
## view_1 zipcode_98166 zipcode_98198
## 1.049111 1.087726 1.111711
## zipcode_98028 floors zipcode_98042
## 1.075043 2.148960 1.292638
## zipcode_98030 zipcode_98011 zipcode_98070
## 1.126009 1.061104 1.121497
## zipcode_98032 zipcode_98155 zipcode_98034
## 1.059415 1.123381 1.173275
## season_Spring day_of_year zipcode_98077
## 1.573156 3.389715 1.092376
## zipcode_98148 season_Summer zipcode_98125
## 1.024765 1.620946 1.122651
## zipcode_98072 zipcode_98133 zipcode_98074
## 1.089445 1.139479 1.172599
## zipcode_98019 zipcode_98092 zipcode_98038
## 1.079371 1.317431 1.388642
## zipcode_98023 zipcode_98001 zipcode_98003
## 1.439587 1.300638 1.213702
## zipcode_98002 zipcode_98065 zipcode_98126
## 1.174290 1.195131 1.108418
## zipcode_98027 zipcode_98075 zipcode_98053
## 1.128614 1.183813 1.163209
## zipcode_98052 zipcode_98008 zipcode_98006
## 1.225477 1.122828 1.242755
## zipcode_98144 zipcode_98007 zipcode_98005
## 1.163978 1.072503 1.118598
## zipcode_98014 zipcode_98024
## 1.090945 1.051387
We would typically define a VIF threshold (in this case, 10), which serves as a criterion to identify predictor variables with high VIF values, however, there are not any VIF’s above this threshold meaning we do not have issues of multicollinearity.
# Breusch-Pagan Test
bp_test_results <- bptest(transformed_model, studentize = FALSE)
# Output the results of the Breusch-Pagan test
bp_test_results
##
## Breusch-Pagan test
##
## data: transformed_model
## BP = 2482.9, df = 80, p-value < 2.2e-16
In our analysis, we conducted a Breusch-Pagan test to check for heteroscedasticity in our linear regression model, referred to as “transformed_model.” Heteroscedasticity refers to the situation where the spread of errors (residuals) in the model varies across different levels of the independent variables.
The test results showed a very low p-value (p < 2.2e-16), indicating strong evidence against the null hypothesis suggesting that heteroscedasticity is present in the residuals of our model. This finding is essential as it implies that the variability of errors is not consistent across all levels of the predictor variables. Addressing heteroscedasticity may require adjustments to improve the model’s reliability and interpretability.
Figure 8.4 Plots Examining
Heteroscedasticity
# Calculate absolute residuals from the transformed model
transformed_model_residuals <- abs(transformed_model$residuals)
# Fit a linear model on the absolute residuals of the transformed model
residuals_model <- lm(transformed_model_residuals ~ ., data = train_df_both[, -1])
# Retrieve fitted values from the residuals model
fitted_values_residuals <- residuals_model$fitted.values
# Calculate weights as the inverse square of the fitted values
weights_iteration_1 <- 1 / (fitted_values_residuals ^ 2)
# Refit the transformed model with the new weights
transformed_model_weighted <- lm(transformed_price ~ ., weights = weights_iteration_1, data = train_df_both[, -1])
# Add coefficients to check if heteroscedascticity is resolved later
coefficients_df <- create_coefficients_df(
model = transformed_model_weighted,
model_name = "WT Model",
coefficients_df = coefficients_df
)
Figure 8.5 Plots Post WLS on Transformed
Model
Figure 8.6 Boxplot for WLS on Transformed
Model
# Perform Breusch-Pagan test to check for heteroscedasticity
bptest_result <- bptest(transformed_model_weighted, studentize = TRUE)
bptest_result
##
## studentized Breusch-Pagan test
##
## data: transformed_model_weighted
## BP = 72140, df = 80, p-value < 2.2e-16
# Iterate to improve the model based on Breusch-Pagan test results
iteration <- 1
#TODO change back to 200
while (iteration < 1 && bptest_result$p.value < 0.05) {
# Calculate new absolute residuals
abs_residuals_new <- abs(transformed_model_weighted$residuals)
# Fit a new linear model on the new absolute residuals
residuals_model_new <- lm(abs_residuals_new ~ ., data = train_df_both[, -1])
# Retrieve new fitted values
fitted_values_residuals_new <- residuals_model_new$fitted.values
# Calculate new weights
weights_new <- 1 / (fitted_values_residuals_new ^ 2)
# Refit the transformed model with the new weights
transformed_model_weighted <- lm(log(price) ~ ., weights = weights_new, data = train_df_both)
# Update the Breusch-Pagan test result
bptest_result <- bptest(transformed_model_weighted, studentize = TRUE)
# Increment iteration counter
iteration <- iteration + 1
}
# Extract coefficients from the final weighted least squares model
coefficients_wls_final <- transformed_model_weighted$coefficients
# Summary and diagnostic plots of the final model
summary(transformed_model_weighted)
##
## Call:
## lm(formula = transformed_price ~ ., data = train_df_both[, -1],
## weights = weights_iteration_1)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -11.1168 -0.7413 0.0663 0.8179 9.0178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.886e+02 3.701e+01 -13.200 < 2e-16 ***
## bathrooms 1.621e-01 1.032e-02 15.709 < 2e-16 ***
## sqft_living 8.637e-04 1.270e-05 68.031 < 2e-16 ***
## sqft_basement -3.412e-04 1.571e-05 -21.722 < 2e-16 ***
## sqft_living15 3.749e-04 1.237e-05 30.305 < 2e-16 ***
## grade_Average -1.131e-01 5.060e-02 -2.236 0.025372 *
## distance_to_convergence -5.833e-02 9.581e-04 -60.885 < 2e-16 ***
## waterfront_0 -1.561e+00 1.097e-01 -14.231 < 2e-16 ***
## zipcode_98004 7.323e-01 4.899e-02 14.949 < 2e-16 ***
## zipcode_98112 8.170e-01 6.414e-02 12.738 < 2e-16 ***
## zipcode_98039 1.280e+00 9.303e-02 13.759 < 2e-16 ***
## view_4 1.058e+00 7.609e-02 13.902 < 2e-16 ***
## bedrooms -1.122e-02 5.294e-03 -2.119 0.034079 *
## zipcode_98040 1.781e-01 5.009e-02 3.557 0.000377 ***
## view_3 6.483e-01 4.664e-02 13.900 < 2e-16 ***
## zipcode_98119 8.048e-01 6.242e-02 12.892 < 2e-16 ***
## zipcode_98105 4.359e-01 4.903e-02 8.890 < 2e-16 ***
## zipcode_98109 7.348e-01 8.680e-02 8.465 < 2e-16 ***
## zipcode_98178 -2.003e+00 4.516e-02 -44.339 < 2e-16 ***
## zipcode_98102 5.448e-01 8.192e-02 6.650 3.03e-11 ***
## zipcode_98117 2.987e-01 3.377e-02 8.844 < 2e-16 ***
## zipcode_98103 1.752e-01 4.109e-02 4.263 2.03e-05 ***
## view_2 4.000e-01 2.685e-02 14.897 < 2e-16 ***
## zipcode_98107 2.952e-01 3.574e-02 8.260 < 2e-16 ***
## zipcode_98059 -1.283e+00 3.100e-02 -41.370 < 2e-16 ***
## year_sold 2.551e-01 1.837e-02 13.892 < 2e-16 ***
## condition_5 3.476e-01 1.721e-02 20.197 < 2e-16 ***
## zipcode_98056 -1.482e+00 4.058e-02 -36.509 < 2e-16 ***
## zipcode_98058 -1.486e+00 3.347e-02 -44.413 < 2e-16 ***
## zipcode_98168 -2.053e+00 4.996e-02 -41.092 < 2e-16 ***
## zipcode_98055 -1.740e+00 4.868e-02 -35.745 < 2e-16 ***
## zipcode_98118 -1.257e+00 5.429e-02 -23.154 < 2e-16 ***
## zipcode_98108 -1.412e+00 5.473e-02 -25.807 < 2e-16 ***
## sqft_lot 3.569e-06 2.230e-07 16.005 < 2e-16 ***
## zipcode_98106 -1.416e+00 4.661e-02 -30.380 < 2e-16 ***
## zipcode_98146 -1.442e+00 7.070e-02 -20.401 < 2e-16 ***
## yr_renovated 1.270e-04 1.523e-05 8.335 < 2e-16 ***
## zipcode_98031 -1.541e+00 3.009e-02 -51.226 < 2e-16 ***
## zipcode_98188 -1.792e+00 5.826e-02 -30.766 < 2e-16 ***
## condition_4 1.243e-01 1.111e-02 11.187 < 2e-16 ***
## view_1 3.750e-01 4.404e-02 8.517 < 2e-16 ***
## zipcode_98166 -1.047e+00 5.420e-02 -19.326 < 2e-16 ***
## zipcode_98198 -1.460e+00 5.283e-02 -27.641 < 2e-16 ***
## zipcode_98028 -8.996e-01 3.942e-02 -22.824 < 2e-16 ***
## floors -6.296e-02 1.249e-02 -5.041 4.69e-07 ***
## zipcode_98042 -1.331e+00 3.149e-02 -42.266 < 2e-16 ***
## zipcode_98030 -1.423e+00 3.792e-02 -37.525 < 2e-16 ***
## zipcode_98011 -8.225e-01 4.192e-02 -19.621 < 2e-16 ***
## zipcode_98070 -6.188e-01 9.087e-02 -6.811 1.01e-11 ***
## zipcode_98032 -1.552e+00 5.920e-02 -26.207 < 2e-16 ***
## zipcode_98155 -8.494e-01 3.097e-02 -27.424 < 2e-16 ***
## zipcode_98034 -7.414e-01 2.878e-02 -25.764 < 2e-16 ***
## season_Spring 1.150e-01 1.247e-02 9.223 < 2e-16 ***
## day_of_year 5.163e-04 8.779e-05 5.881 4.17e-09 ***
## zipcode_98077 -5.476e-01 4.521e-02 -12.111 < 2e-16 ***
## zipcode_98148 -1.519e+00 1.021e-01 -14.883 < 2e-16 ***
## season_Summer 6.911e-02 1.258e-02 5.495 3.98e-08 ***
## zipcode_98125 -6.019e-01 3.774e-02 -15.950 < 2e-16 ***
## zipcode_98072 -5.898e-01 3.938e-02 -14.978 < 2e-16 ***
## zipcode_98133 -7.432e-01 3.198e-02 -23.240 < 2e-16 ***
## zipcode_98074 -4.180e-01 3.089e-02 -13.529 < 2e-16 ***
## zipcode_98019 -7.876e-01 4.184e-02 -18.825 < 2e-16 ***
## zipcode_98092 -1.065e+00 3.783e-02 -28.156 < 2e-16 ***
## zipcode_98038 -8.903e-01 2.767e-02 -32.169 < 2e-16 ***
## zipcode_98023 -1.085e+00 3.234e-02 -33.556 < 2e-16 ***
## zipcode_98001 -1.182e+00 3.816e-02 -30.984 < 2e-16 ***
## zipcode_98003 -1.024e+00 4.166e-02 -24.573 < 2e-16 ***
## zipcode_98002 -1.307e+00 4.349e-02 -30.061 < 2e-16 ***
## zipcode_98065 -3.322e-01 3.298e-02 -10.073 < 2e-16 ***
## zipcode_98126 -6.259e-01 4.484e-02 -13.960 < 2e-16 ***
## zipcode_98027 -3.878e-01 3.562e-02 -10.888 < 2e-16 ***
## zipcode_98075 -3.524e-01 3.212e-02 -10.971 < 2e-16 ***
## zipcode_98053 -3.111e-01 3.884e-02 -8.010 1.23e-15 ***
## zipcode_98052 -3.719e-01 2.578e-02 -14.427 < 2e-16 ***
## zipcode_98008 -5.042e-01 3.855e-02 -13.079 < 2e-16 ***
## zipcode_98006 -4.749e-01 3.750e-02 -12.663 < 2e-16 ***
## zipcode_98144 -6.877e-01 4.581e-02 -15.011 < 2e-16 ***
## zipcode_98007 -5.496e-01 4.620e-02 -11.895 < 2e-16 ***
## zipcode_98005 -3.329e-01 5.388e-02 -6.179 6.61e-10 ***
## zipcode_98014 -3.996e-01 9.629e-02 -4.149 3.35e-05 ***
## zipcode_98024 -5.133e-01 8.479e-02 -6.053 1.45e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.326 on 15045 degrees of freedom
## Multiple R-squared: 0.8769, Adjusted R-squared: 0.8762
## F-statistic: 1340 on 80 and 15045 DF, p-value: < 2.2e-16
# Evaluate model performance on the training dataset
predicted_train <- predict(transformed_model_weighted, newdata = train_df_both)
MAE_train <- mae(train_df_both$price, predicted_train)
SSE_train <- sum((train_df_both$price - predicted_train) ^ 2)
R_squared_train <- R2(train_df_both$price, predicted_train)
RMSE_train <- rmse(train_df_both$price, predicted_train)
# Evaluate model performance on the test dataset
predicted_test <- predict(transformed_model_weighted, newdata = test_df_both)
MAE_test <- mae(test_df_both$price, predicted_test)
SSE_test <- sum((test_df_both$price - predicted_test) ^ 2)
R_squared_test <- R2(test_df_both$price, predicted_test)
RMSE_test <- rmse(test_df_both$price, predicted_test)
# Add the performance metrics of the transformed model to the df_results dataframe
df_results <- rbind(df_results, data.frame(
Model = "WT Model 200",
SSE_train = SSE_train,
SSE_test = SSE_test,
R_squared_train = R_squared_train,
R_squared_test = R_squared_test,
RMSE_train = RMSE_train,
RMSE_test = RMSE_test,
MAE_train = MAE_train,
MAE_test = MAE_test
))
summary(transformed_model_weighted)
##
## Call:
## lm(formula = transformed_price ~ ., data = train_df_both[, -1],
## weights = weights_iteration_1)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -11.1168 -0.7413 0.0663 0.8179 9.0178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.886e+02 3.701e+01 -13.200 < 2e-16 ***
## bathrooms 1.621e-01 1.032e-02 15.709 < 2e-16 ***
## sqft_living 8.637e-04 1.270e-05 68.031 < 2e-16 ***
## sqft_basement -3.412e-04 1.571e-05 -21.722 < 2e-16 ***
## sqft_living15 3.749e-04 1.237e-05 30.305 < 2e-16 ***
## grade_Average -1.131e-01 5.060e-02 -2.236 0.025372 *
## distance_to_convergence -5.833e-02 9.581e-04 -60.885 < 2e-16 ***
## waterfront_0 -1.561e+00 1.097e-01 -14.231 < 2e-16 ***
## zipcode_98004 7.323e-01 4.899e-02 14.949 < 2e-16 ***
## zipcode_98112 8.170e-01 6.414e-02 12.738 < 2e-16 ***
## zipcode_98039 1.280e+00 9.303e-02 13.759 < 2e-16 ***
## view_4 1.058e+00 7.609e-02 13.902 < 2e-16 ***
## bedrooms -1.122e-02 5.294e-03 -2.119 0.034079 *
## zipcode_98040 1.781e-01 5.009e-02 3.557 0.000377 ***
## view_3 6.483e-01 4.664e-02 13.900 < 2e-16 ***
## zipcode_98119 8.048e-01 6.242e-02 12.892 < 2e-16 ***
## zipcode_98105 4.359e-01 4.903e-02 8.890 < 2e-16 ***
## zipcode_98109 7.348e-01 8.680e-02 8.465 < 2e-16 ***
## zipcode_98178 -2.003e+00 4.516e-02 -44.339 < 2e-16 ***
## zipcode_98102 5.448e-01 8.192e-02 6.650 3.03e-11 ***
## zipcode_98117 2.987e-01 3.377e-02 8.844 < 2e-16 ***
## zipcode_98103 1.752e-01 4.109e-02 4.263 2.03e-05 ***
## view_2 4.000e-01 2.685e-02 14.897 < 2e-16 ***
## zipcode_98107 2.952e-01 3.574e-02 8.260 < 2e-16 ***
## zipcode_98059 -1.283e+00 3.100e-02 -41.370 < 2e-16 ***
## year_sold 2.551e-01 1.837e-02 13.892 < 2e-16 ***
## condition_5 3.476e-01 1.721e-02 20.197 < 2e-16 ***
## zipcode_98056 -1.482e+00 4.058e-02 -36.509 < 2e-16 ***
## zipcode_98058 -1.486e+00 3.347e-02 -44.413 < 2e-16 ***
## zipcode_98168 -2.053e+00 4.996e-02 -41.092 < 2e-16 ***
## zipcode_98055 -1.740e+00 4.868e-02 -35.745 < 2e-16 ***
## zipcode_98118 -1.257e+00 5.429e-02 -23.154 < 2e-16 ***
## zipcode_98108 -1.412e+00 5.473e-02 -25.807 < 2e-16 ***
## sqft_lot 3.569e-06 2.230e-07 16.005 < 2e-16 ***
## zipcode_98106 -1.416e+00 4.661e-02 -30.380 < 2e-16 ***
## zipcode_98146 -1.442e+00 7.070e-02 -20.401 < 2e-16 ***
## yr_renovated 1.270e-04 1.523e-05 8.335 < 2e-16 ***
## zipcode_98031 -1.541e+00 3.009e-02 -51.226 < 2e-16 ***
## zipcode_98188 -1.792e+00 5.826e-02 -30.766 < 2e-16 ***
## condition_4 1.243e-01 1.111e-02 11.187 < 2e-16 ***
## view_1 3.750e-01 4.404e-02 8.517 < 2e-16 ***
## zipcode_98166 -1.047e+00 5.420e-02 -19.326 < 2e-16 ***
## zipcode_98198 -1.460e+00 5.283e-02 -27.641 < 2e-16 ***
## zipcode_98028 -8.996e-01 3.942e-02 -22.824 < 2e-16 ***
## floors -6.296e-02 1.249e-02 -5.041 4.69e-07 ***
## zipcode_98042 -1.331e+00 3.149e-02 -42.266 < 2e-16 ***
## zipcode_98030 -1.423e+00 3.792e-02 -37.525 < 2e-16 ***
## zipcode_98011 -8.225e-01 4.192e-02 -19.621 < 2e-16 ***
## zipcode_98070 -6.188e-01 9.087e-02 -6.811 1.01e-11 ***
## zipcode_98032 -1.552e+00 5.920e-02 -26.207 < 2e-16 ***
## zipcode_98155 -8.494e-01 3.097e-02 -27.424 < 2e-16 ***
## zipcode_98034 -7.414e-01 2.878e-02 -25.764 < 2e-16 ***
## season_Spring 1.150e-01 1.247e-02 9.223 < 2e-16 ***
## day_of_year 5.163e-04 8.779e-05 5.881 4.17e-09 ***
## zipcode_98077 -5.476e-01 4.521e-02 -12.111 < 2e-16 ***
## zipcode_98148 -1.519e+00 1.021e-01 -14.883 < 2e-16 ***
## season_Summer 6.911e-02 1.258e-02 5.495 3.98e-08 ***
## zipcode_98125 -6.019e-01 3.774e-02 -15.950 < 2e-16 ***
## zipcode_98072 -5.898e-01 3.938e-02 -14.978 < 2e-16 ***
## zipcode_98133 -7.432e-01 3.198e-02 -23.240 < 2e-16 ***
## zipcode_98074 -4.180e-01 3.089e-02 -13.529 < 2e-16 ***
## zipcode_98019 -7.876e-01 4.184e-02 -18.825 < 2e-16 ***
## zipcode_98092 -1.065e+00 3.783e-02 -28.156 < 2e-16 ***
## zipcode_98038 -8.903e-01 2.767e-02 -32.169 < 2e-16 ***
## zipcode_98023 -1.085e+00 3.234e-02 -33.556 < 2e-16 ***
## zipcode_98001 -1.182e+00 3.816e-02 -30.984 < 2e-16 ***
## zipcode_98003 -1.024e+00 4.166e-02 -24.573 < 2e-16 ***
## zipcode_98002 -1.307e+00 4.349e-02 -30.061 < 2e-16 ***
## zipcode_98065 -3.322e-01 3.298e-02 -10.073 < 2e-16 ***
## zipcode_98126 -6.259e-01 4.484e-02 -13.960 < 2e-16 ***
## zipcode_98027 -3.878e-01 3.562e-02 -10.888 < 2e-16 ***
## zipcode_98075 -3.524e-01 3.212e-02 -10.971 < 2e-16 ***
## zipcode_98053 -3.111e-01 3.884e-02 -8.010 1.23e-15 ***
## zipcode_98052 -3.719e-01 2.578e-02 -14.427 < 2e-16 ***
## zipcode_98008 -5.042e-01 3.855e-02 -13.079 < 2e-16 ***
## zipcode_98006 -4.749e-01 3.750e-02 -12.663 < 2e-16 ***
## zipcode_98144 -6.877e-01 4.581e-02 -15.011 < 2e-16 ***
## zipcode_98007 -5.496e-01 4.620e-02 -11.895 < 2e-16 ***
## zipcode_98005 -3.329e-01 5.388e-02 -6.179 6.61e-10 ***
## zipcode_98014 -3.996e-01 9.629e-02 -4.149 3.35e-05 ***
## zipcode_98024 -5.133e-01 8.479e-02 -6.053 1.45e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.326 on 15045 degrees of freedom
## Multiple R-squared: 0.8769, Adjusted R-squared: 0.8762
## F-statistic: 1340 on 80 and 15045 DF, p-value: < 2.2e-16
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = transformed_model_weighted,
model_name = "WT Model 200",
coefficients_df = coefficients_df
)
# View the updated model results
view_model_results(df_results, caption = generate_table_caption("Weighted Transformation Model Addition", section = 8))
bptest(transformed_model_weighted, studentize = TRUE)
##
## studentized Breusch-Pagan test
##
## data: transformed_model_weighted
## BP = 72140, df = 80, p-value < 2.2e-16
After performing the initial Breusch-Pagan test to assess
heteroscedasticity on the weighted linear model
(transformed_model_weighted), the test yielded a statistic
of 71397 with 80 degrees of freedom, resulting in a p-value less than
2.2e-16, indicating significant heteroscedasticity in the model. This
initial result indicated that the residuals of the model exhibited a
pattern that violated the assumption of constant variance.
To address this issue, 200 iterations were carried out, where the model was repeatedly re-fitted and the Breusch-Pagan test was performed after each iteration. After these iterations, the Breusch-Pagan test resulted in a significantly larger statistic of 976748 with the same 80 degrees of freedom, and a p-value still less than 2.2e-16. Despite the numerous iterations, the p-value remained extremely low, indicating that heteroscedasticity persisted in the model.
Even after 200 iterations of weighting the linear model in an attempt to address heteroscedasticity, the model continued to exhibit significant heteroscedasticity, as indicated by the persistently low p-value from the Breusch-Pagan test. Further investigation or alternative modeling approaches may be necessary to effectively address this issue.
Figure 8.7 Plots Post WLS on Transformed Model after 200
Iterations
Figure 8.8 Boxplot for WLS on Transformed Model after 200
Iterations
# Fetch the Weighted coefficients to compare results
model_names_to_match <- c("WT Model", "WT Model 200")
# Use the subset function to select rows where Model_Name matches any of the model names in the list
selected_rows <- subset(coefficients_df, Model_Name %in% model_names_to_match)
# Identify columns with NA values in the selected subset of rows
cols_with_na <- colnames(selected_rows)[apply(selected_rows, 2, anyNA)]
# Remove columns with NA values from the selected subset
selected_rows <- selected_rows[, !colnames(selected_rows) %in% cols_with_na]
x <- data.matrix(dplyr::select(train_df_both, -price))
y <- log(train_df_both$price)
# Problem 8.5: Run Ridge Regression
set.seed(1023)
RidgeMod <- glmnet(x, y, alpha = 0, nlambda = 100, lambda.min.ratio = 0.0001)
CvRidgeMod <- cv.glmnet(x, y, alpha = 0, nlambda = 100, lambda.min.ratio = 0.0001)
# Find the best lambda
best.lambda.ridge <- CvRidgeMod$lambda.min
print(paste0("Best Lambda: ", best.lambda.ridge))
## [1] "Best Lambda: 0.0363370475980558"
# Add coefficients to dataframe
coefficients_df <- create_coefficients_df(
model = coefficients(RidgeMod,s=best.lambda.ridge),
model_name = "Ridge Model",
coefficients_df = coefficients_df
)
# Fetch the coefficients to compare results
model_names_to_match <- c("Ridge Model", "WT Model 200")
# Use the subset function to select rows where Model_Name matches any of the model names in the list
selected_rows <- subset(coefficients_df, Model_Name %in% model_names_to_match)
# Identify columns with NA values in the selected subset of rows
cols_with_na <- colnames(selected_rows)[apply(selected_rows, 2, anyNA)]
# Remove columns with NA values from the selected subset
selected_rows <- selected_rows[, !colnames(selected_rows) %in% cols_with_na]
Figure 8.9 Ridge Regression
Plot
LassoMod <- glmnet(x, y, alpha = 1, nlambda = 100, lambda.min.ratio = 0.0001)
CvLassoMod <- cv.glmnet(x, y, alpha = 1, nlambda = 100, lambda.min.ratio = 0.0001)
# Find the best lambda for Lasso
best.lambda.lasso <- CvLassoMod$lambda.min
print(paste0("Best Lambda for Lasso: ", best.lambda.lasso))
## [1] "Best Lambda for Lasso: 0.000176692585065216"
# Add coefficients to dataframe for Lasso
coefficients_df <- create_coefficients_df(
model = coefficients(LassoMod, s = best.lambda.lasso),
model_name = "Lasso Model",
coefficients_df = coefficients_df
)
# Fetch the coefficients to compare results between Lasso and other models
model_names_to_match <- c("Lasso Model", "WT Model 200")
# Use the subset function to select rows where Model_Name matches any of the model names in the list
selected_rows <- subset(coefficients_df, Model_Name %in% model_names_to_match)
# Identify columns with NA values in the selected subset of rows
cols_with_na <- colnames(selected_rows)[apply(selected_rows, 2, anyNA)]
# Remove columns with NA values from the selected subset
selected_rows <- selected_rows[, !colnames(selected_rows) %in% cols_with_na]
Figure 8.10 Lasso Regression
Plot
# Set alpha for Elastic Net
alpha_elastic_net <- 0.5
# Problem 8.5: Run Elastic Net Regression
ElasticNetMod <- glmnet(x, y, alpha = alpha_elastic_net, nlambda = 100, lambda.min.ratio = 0.0001)
CvElasticNetMod <- cv.glmnet(x, y, alpha = alpha_elastic_net, nlambda = 100, lambda.min.ratio = 0.0001)
# Find the best lambda for Elastic Net
best.lambda.elastic_net <- CvElasticNetMod$lambda.min
print(paste0("Best Lambda for Elastic Net: ", best.lambda.elastic_net))
## [1] "Best Lambda for Elastic Net: 0.000321991405586386"
# Add coefficients to dataframe for Elastic Net
coefficients_df <- create_coefficients_df(
model = coefficients(ElasticNetMod, s = best.lambda.elastic_net),
model_name = "EN Model",
coefficients_df = coefficients_df
)
# Fetch the coefficients to compare results between Elastic Net and other models
model_names_to_match <- c("EN Model", "WT Model 200")
# Use the subset function to select rows where Model_Name matches any of the model names in the list
selected_rows <- subset(coefficients_df, Model_Name %in% model_names_to_match)
# Identify columns with NA values in the selected subset of rows
cols_with_na <- colnames(selected_rows)[apply(selected_rows, 2, anyNA)]
# Remove columns with NA values from the selected subset
selected_rows <- selected_rows[, !colnames(selected_rows) %in% cols_with_na]
Figure 8.11 Elastic Net Regression
Plot
# Run Huber Regression using rlm
HuberMod <- rlm(log(price) ~ ., data = train_df_both)
# Add coefficients to dataframe for Huber
coefficients_df <- create_coefficients_df(
model = HuberMod,
model_name = "Huber Model",
coefficients_df = coefficients_df
)
# Fetch the coefficients to compare results between Huber and other models
model_names_to_match <- c("Huber Model", "WT Model 200")
# Use the subset function to select rows where Model_Name matches any of the model names in the list
selected_rows <- subset(coefficients_df, Model_Name %in% model_names_to_match)
# Identify columns with NA values in the selected subset of rows
cols_with_na <- colnames(selected_rows)[apply(selected_rows, 2, anyNA)]
# Remove columns with NA values from the selected subset
selected_rows <- selected_rows[, !colnames(selected_rows) %in% cols_with_na]
Copyright © 2023 Charles and with AI assistance. Fabricated with blood, sweat, and more coffee than a pod of overcaffeinated narwhals. Unauthorized replication of this work will result in a jester performing terribly lame jokes at your next family gathering. Proceed with caution.